In [9]:
import pandas as pd
import numpy as np
import joblib
import random


In [14]:
def transform_new_data(test_data_path, reference_data_path, model_features, scaler_path, output_transformed_path):
    # Load the reference data (training dataset) and the test data (new dataset)
    reference_data = pd.read_csv(reference_data_path)
    test_data = pd.read_csv(test_data_path)
    
    # Step 1: Ensure all the required features are present in the test data
    missing_cols = set(model_features) - set(test_data.columns)
    for col in missing_cols:
        # If a feature is missing, sample a value from the reference data (training data)
        sampled_value = reference_data[col].dropna().sample(1).values[0]
        test_data[col] = sampled_value

    # Step 2: Reorder the columns to match the training feature order
    test_data = test_data[model_features]
    
    # Step 3: Handle missing values in the test data for features that exist
    for col in model_features:
        if col in test_data.columns and test_data[col].isnull().sum() > 0:
            # Sample from the reference data if values are missing, otherwise fill with 0
            sampled_values = reference_data[col].dropna()
            if not sampled_values.empty:
                test_data[col] = test_data[col].apply(lambda x: random.choice(sampled_values) if pd.isna(x) else x)
            else:
                test_data[col] = test_data[col].fillna(0)  # Fill with 0 if no sampled values

    # Step 4: Handle inf and NaN values in the dataset
    test_data.replace([np.inf, -np.inf], np.nan, inplace=True)  # Replace infinity values with NaN
    test_data.fillna(test_data.mean(), inplace=True)  # Fill NaNs with the mean of the column
    test_data = test_data.clip(upper=1e10)  # Clip extreme values to prevent outliers
    test_data = test_data.astype(np.float32)  # Convert to appropriate type for processing

    # Step 5: Load the pre-trained scaler and scale the data
    scaler = joblib.load(scaler_path)  # Load the MinMaxScaler from file
    test_data_scaled = scaler.transform(test_data)  # Apply scaling to the test data
    
    # Step 6: Save the transformed (scaled) test data
    transformed_df = pd.DataFrame(test_data_scaled, columns=model_features)
    transformed_df.to_csv(output_transformed_path, index=False)

    print(f"✅ Transformed data saved to {output_transformed_path}")

In [15]:
# Define the features expected by the model (same as used during training)
model_features = [
    'Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
    'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
    'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min',
    'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s',
    'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Std',
    'Fwd IAT Max', 'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
    'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
    'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Mean',
    'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt',
    'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'Down/Up Ratio', 'Fwd Byts/b Avg',
    'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg', 'Bwd Pkts/b Avg',
    'Bwd Blk Rate Avg', 'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Act Data Pkts',
    'Fwd Seg Size Min', 'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean',
    'Idle Std'
]

# Call the function with paths to your data
transform_new_data(
    test_data_path=r"D:\4th semester\SE\project\Datasets\benign_ftp_bruteforce.csv",  # New data you want to process
    reference_data_path=r"D:\4th semester\SE\project\Datasets\balanced_FTP-BruteForce.csv",  # Reference training data used to train the model
    model_features=model_features,  # List of features the model expects
    scaler_path=r'D:\4th semester\SE\project\Models\min_max_scaler.pkl',  # Pre-trained scaler for consistent scaling
    output_transformed_path=r'D:\4th semester\SE\project\Datasets\transformed_nigger.csv'  # Output file for transformed data
)


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Bwd Seg Size Avg
- ECE Flag Cnt
- Fwd IAT Mean
- Fwd IAT Min
- Fwd IAT Tot
- ...
