In [None]:
import pandas as pd
import random
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
import joblib
import os

In [7]:
def handle_missing_data(test_data, reference_data, model_features):
    """Handle missing columns and missing values in the test data by sampling from reference data."""
    for col in model_features:
        if col not in test_data.columns:
            # Sample from reference data if column is missing
            sampled_value = reference_data[col].dropna().sample(1).values[0]
            test_data[col] = sampled_value
        else:
            # Handle missing values in existing columns
            if test_data[col].isnull().sum() > 0:
                sampled_values = reference_data[col].dropna()
                if not sampled_values.empty:
                    test_data[col] = test_data[col].apply(
                        lambda x: random.choice(sampled_values) if pd.isna(x) else x
                    )
                else:
                    test_data[col] = test_data[col].fillna(0)  # If no data to sample from, fill with 0
    
    return test_data

In [6]:
def match_model_features(df, model_features, training_df=None):
    """Ensure the dataframe has exactly the model_features, adding missing ones with mean values and dropping extras."""
    # Add missing columns (with mean values from training_df if provided)
    missing_cols = set(model_features) - set(df.columns)
    if missing_cols:
        print(f"Adding missing columns: {missing_cols}")
        for col in missing_cols:
            # If training_df is provided, use its mean value for the missing columns
            if training_df is not None and col in training_df.columns:
                df[col] = training_df[col].mean()
            else:
                df[col] = np.nan  # In case training_df is not provided, fill with NaN
    # Drop extra columns not in model_features
    extra_cols = set(df.columns) - set(model_features)
    if extra_cols:
        print(f"Dropping extra columns: {extra_cols}")
        df.drop(columns=extra_cols, inplace=True)
    
    # Ensure the columns are ordered the same as in model_features
    df = df[model_features]
    
    return df



In [9]:

model_features = [
    'Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
    'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
    'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min',
    'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s',
    'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Std',
    'Fwd IAT Max', 'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
    'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
    'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Mean',
    'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt',
    'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'Down/Up Ratio', 'Fwd Byts/b Avg',
    'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg', 'Bwd Pkts/b Avg',
    'Bwd Blk Rate Avg', 'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Act Data Pkts',
    'Fwd Seg Size Min', 'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean',
    'Idle Std'
]
training_df=pd.read_csv(r"D:\4th semester\SE\project\Datasets\step_4.csv")
test_data = pd.DataFrame({
    'Protocol': [6.0, 17.0, np.nan],
    'Tot Bwd Pkts': [12.0, np.nan, 30.0],
    'Fwd Pkt Len Max': [np.nan, 1200.0, 500.0],
    # Not including all 63 features
})
from copy import deepcopy
test_copy = deepcopy(test_data)
test_copy = match_model_features(test_copy, model_features, training_df)
test_copy = handle_missing_data(test_copy, training_df, model_features)

print(test_copy.head())
print("Shape after processing:", test_copy.shape)
print("Missing values remaining:", test_copy.isnull().sum().sum())

Adding missing columns: {'Bwd IAT Std', 'Fwd Pkts/b Avg', 'Init Bwd Win Byts', 'Active Std', 'Flow Byts/s', 'Bwd Pkt Len Mean', 'Bwd URG Flags', 'Fwd Seg Size Min', 'Init Fwd Win Byts', 'Flow IAT Min', 'Active Max', 'Bwd Pkt Len Std', 'Bwd Pkt Len Max', 'Fwd PSH Flags', 'Flow IAT Std', 'Flow Duration', 'Bwd Header Len', 'Active Min', 'PSH Flag Cnt', 'Flow Pkts/s', 'Timestamp', 'Bwd IAT Mean', 'Fwd Header Len', 'Fwd IAT Max', 'Flow IAT Max', 'Fwd Byts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Pkts/s', 'Flow IAT Mean', 'Pkt Len Var', 'Bwd Pkt Len Min', 'URG Flag Cnt', 'Idle Mean', 'Bwd Byts/b Avg', 'Fwd Pkt Len Min', 'TotLen Bwd Pkts', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Fwd URG Flags', 'Down/Up Ratio', 'Bwd IAT Tot', 'Idle Std', 'Tot Fwd Pkts', 'Bwd IAT Max', 'Pkt Len Std', 'FIN Flag Cnt', 'Pkt Len Mean', 'Bwd Pkts/b Avg', 'Active Mean', 'CWE Flag Count', 'Fwd Pkts/s', 'RST Flag Cnt', 'Fwd Act Data Pkts', 'Dst Port', 'Bwd Blk Rate Avg', 'Bwd PSH Flags', 'TotLen Fwd Pkts', 'ACK Flag Cnt', 'Bw

In [8]:
from sklearn.preprocessing import MinMaxScaler
import joblib

scaler = joblib.load(r'D:\4th semester\SE\project\Models\min_max_scaler.pkl')  # Your saved fitted scaler
new_data_scaled = scaler.transform(test_copy) 


print("Min values:", np.min(new_data_scaled, axis=0))
print("Max values:", np.max(new_data_scaled, axis=0))

Min values: [7.04663125e-02 0.00000000e+00 5.75235799e-01 5.30563001e-02
 1.51706775e-02 1.08695652e-02 1.36953433e-02 1.53659868e-03
 0.00000000e+00 6.06973250e-02 9.16103324e-02 7.56640136e-02
 2.14624648e-01 6.01961433e-02 6.90170425e-02 1.10310047e-01
 3.27926526e-03 1.35329553e-01 1.56701823e-02 5.13169950e-03
 2.35574726e-02 1.37968138e-02 4.84191034e-03 2.20493071e-02
 3.03179768e-02 1.89789780e-02 1.59376284e-02 1.21739333e-02
 3.04249134e-03 2.26961409e-02 0.00000000e+00 0.00000000e+00
 0.00000000e+00 1.99161911e-02 1.28028598e-02 7.03200066e-02
 1.30019092e-01 7.30919944e-02 1.20860664e-01 4.91589665e-02
 0.00000000e+00 4.05223238e-02 5.21539476e-01 2.64041927e-01
 1.25382629e-01 0.00000000e+00 1.64617737e-01 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 1.63025862e-01 9.61599599e-02 6.62088059e-02
 4.88107471e-01 3.61600030e-03 4.57198471e-03 6.21860728e-03
 2.99223768e-03 9.52763017e-03 2.44892671e-04]
Max values: [7.04663125e-0

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# List of model features
model_features = [
    'Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
    'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
    'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min',
    'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s',
    'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Std',
    'Fwd IAT Max', 'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
    'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
    'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Mean',
    'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt',
    'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'Down/Up Ratio', 'Fwd Byts/b Avg',
    'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg', 'Bwd Pkts/b Avg',
    'Bwd Blk Rate Avg', 'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Act Data Pkts',
    'Fwd Seg Size Min', 'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean',
    'Idle Std'
]

# Generate 3 dummy rows
dummy_data = []

for _ in range(3):
    row = []
    for feature in model_features:
        if feature == 'Timestamp':
            # Generate a random datetime and convert to string in expected format
            time = datetime(2024, 1, 1, 12, 0, 0) + timedelta(minutes=np.random.randint(0, 1000))
            row.append(time.strftime('%d/%m/%Y %H:%M:%S'))
        elif feature in ['Protocol', 'Dst Port']:
            row.append(np.random.randint(1, 255))
        elif 'Flag' in feature or 'Ratio' in feature or 'CWE' in feature:
            row.append(np.random.randint(0, 2))
        elif 'Win' in feature or 'Pkt' in feature or 'Byts' in feature:
            row.append(np.random.randint(1, 10000))
        else:
            row.append(np.round(np.random.uniform(0.01, 1000), 2))
    dummy_data.append(row)

# Create DataFrame
df_dummy = pd.DataFrame(dummy_data, columns=model_features)

# Save to CSV (optional)
df_dummy.to_csv("dummy_model_features.csv", index=False)
print("✅ Dummy dataset saved to dummy_model_features.csv")


✅ Dummy dataset saved to dummy_model_features.csv


In [9]:
training_df=pd.read_csv(r"D:\4th semester\SE\project\Datasets\step_4.csv")

from copy import deepcopy
df_dummy  = deepcopy(df_dummy )
df_dummy  = match_model_features(df_dummy , model_features, training_df)
df_dummy = handle_missing_data(df_dummy , training_df, model_features)

print(df_dummy.head())
print("Shape after processing:", df_dummy.shape)
print("Missing values remaining:", df_dummy.isnull().sum().sum())

   Dst Port  Protocol            Timestamp  Flow Duration  Tot Fwd Pkts  \
0        86       249  01/01/2024 17:09:00         784.90          9658   
1       189        57  01/01/2024 13:45:00         858.30          6305   
2       253        74  01/01/2024 16:41:00         266.63          4226   

   Tot Bwd Pkts  TotLen Fwd Pkts  TotLen Bwd Pkts  Fwd Pkt Len Max  \
0          7438             3808             3802             2552   
1           584              947             7929             3923   
2          4619             8255             4254             9351   

   Fwd Pkt Len Min  ...  Init Fwd Win Byts  Init Bwd Win Byts  \
0              717  ...               7726               9056   
1             5762  ...               9359               8541   
2             6319  ...               7195               9246   

   Fwd Act Data Pkts  Fwd Seg Size Min  Active Mean  Active Std  Active Max  \
0               1488            711.18       875.05      198.77      289.26   

In [None]:
def convert_timestamp(df):
    """Convert 'Timestamp' column to seconds."""
    if 'Timestamp' in df.columns:
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce')
        df['Timestamp'] = (df['Timestamp'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
    return df

df_dummy=convert_timestamp(df_dummy)

print(df_dummy.head)


<bound method NDFrame.head of    Dst Port  Protocol   Timestamp  Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  \
0        86       249  1704128940         784.90          9658          7438   
1       189        57  1704116700         858.30          6305           584   
2       253        74  1704127260         266.63          4226          4619   

   TotLen Fwd Pkts  TotLen Bwd Pkts  Fwd Pkt Len Max  Fwd Pkt Len Min  ...  \
0             3808             3802             2552              717  ...   
1              947             7929             3923             5762  ...   
2             8255             4254             9351             6319  ...   

   Init Fwd Win Byts  Init Bwd Win Byts  Fwd Act Data Pkts  Fwd Seg Size Min  \
0               7726               9056               1488            711.18   
1               9359               8541               6783            690.65   
2               7195               9246               4482             64.12   

   Active Mean 

In [12]:
def scale_features(df, scaler_save_path):
    """Scale features except 'Label' using MinMaxScaler and save scaler."""
    feature_cols = [col for col in df.columns if col != 'Label']
    scaler = MinMaxScaler()
    df[feature_cols] = scaler.fit_transform(df[feature_cols])
    joblib.dump(scaler, scaler_save_path)
    print(f"Scaler saved to: {scaler_save_path}")
    return df


df_dummy=scale_features(df_dummy,r"D:\4th semester\SE\project\Models\min_max_scaler.pkl")

print(df_dummy.head)

Scaler saved to: D:\4th semester\SE\project\Models\min_max_scaler.pkl
<bound method NDFrame.head of    Dst Port  Protocol  Timestamp  Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  \
0  0.000000  1.000000   1.000000       0.875944      1.000000      1.000000   
1  0.616766  0.000000   0.000000       1.000000      0.382732      0.000000   
2  1.000000  0.088542   0.862745       0.000000      0.000000      0.588707   

   TotLen Fwd Pkts  TotLen Bwd Pkts  Fwd Pkt Len Max  Fwd Pkt Len Min  ...  \
0         0.391489         0.000000         0.000000         0.000000  ...   
1         0.000000         1.000000         0.201647         0.900571  ...   
2         1.000000         0.109523         1.000000         1.000000  ...   

   Init Fwd Win Byts  Init Bwd Win Byts  Fwd Act Data Pkts  Fwd Seg Size Min  \
0           0.245379           0.730496           0.000000          1.000000   
1           1.000000           0.000000           1.000000          0.968272   
2           0.000000          

In [14]:
import joblib

# Attempt to load the model
    
try:
    model = joblib.load(r'D:\4th semester\SE\project\Models_final\balanced_FTP-BruteForce_model.pkl')  # Replace with your actual model file name
    print("✅ Model loaded successfully!")
    
    # Optional: Check model type or summary
    print("Model details:", model)

except FileNotFoundError:
    print("❌ Model file not found.")
except Exception as e:
    print("❌ Error loading model:", str(e))



try:
    model = joblib.load(r'D:\4th semester\SE\project\Models_final\balanced_SSH-BruteForce_model.pkl')  # Replace with your actual model file name
    print("✅ Model loaded successfully!")
    
    # Optional: Check model type or summary
    print("Model details:", model)

except FileNotFoundError:
    print("❌ Model file not found.")
except Exception as e:
    print("❌ Error loading model:", str(e))


✅ Model loaded successfully!
Model details: RandomForestClassifier(random_state=42)
✅ Model loaded successfully!
Model details: RandomForestClassifier(random_state=42)
