In [1]:
import pandas as pd
import numpy as np

In [2]:
# Lift pandas' display truncation so printed frames/series show every row and column.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:

# Source CSV inside the local kagglehub cache.
file_path = r"C:\Users\user\.cache\kagglehub\datasets\ogguy11\apt-detection\versions\1\02-14-2018.csv"
df = pd.read_csv(file_path)

# Infinity checks are only meaningful on numeric dtypes, so restrict the scan to them.
numeric_df = df.select_dtypes(include=[np.number])

# Tally +inf / -inf cells: per-column sums first, then the grand total.
count_inf = numeric_df.eq(np.inf).sum().sum()
count_neg_inf = numeric_df.eq(-np.inf).sum().sum()
print(f"+Inf count: {count_inf}")
print(f"-Inf count: {count_neg_inf}")

# Turn every infinite value into NaN, then discard each row that holds any NaN.
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# Persist the cleaned rows (without the index) for the next stage.
output_path = r"D:\4th semester\SE\project\Datasets_final\step1.csv"
df.to_csv(output_path, index=False)
print(f"✅ Cleaned dataset saved at: {output_path}")


+Inf count: 3094
-Inf count: 0
✅ Cleaned dataset saved at: D:\4th semester\SE\project\Datasets_final\step1.csv


In [5]:
# Read back the file written by the cleaning step.
input_path = r"D:\4th semester\SE\project\Datasets_final\step1.csv"
df = pd.read_csv(input_path)

# Show the leading rows as plain text.
preview = df.head()
print("📄 Preview of the dataset:\n")
print(preview)

# Column names, dtypes and non-null counts; DataFrame.info() prints its own report.
print("\nℹ️ Dataset info:\n")
df.info()

📄 Preview of the dataset:

   Dst Port  Protocol            Timestamp  Flow Duration  Tot Fwd Pkts  \
0         0         0  14/02/2018 08:31:01      112641719             3   
1         0         0  14/02/2018 08:33:50      112641466             3   
2         0         0  14/02/2018 08:36:39      112638623             3   
3        22         6  14/02/2018 08:40:13        6453966            15   
4        22         6  14/02/2018 08:40:23        8804066            14   

   Tot Bwd Pkts  TotLen Fwd Pkts  TotLen Bwd Pkts  Fwd Pkt Len Max  \
0             0                0                0                0   
1             0                0                0                0   
2             0                0                0                0   
3            10             1239             2273              744   
4            11             1143             2209              744   

   Fwd Pkt Len Min  Fwd Pkt Len Mean  Fwd Pkt Len Std  Bwd Pkt Len Max  \
0                0         

In [6]:
# Reload the step-1 file (`input_path` comes from the preview cell above).
df = pd.read_csv(input_path)

# Frequency of each distinct value in the 'Label' column, most common first.
label_column = df['Label']
count = label_column.value_counts()

# Report the class distribution.
print("Value counts for 'Label' column:", count, sep="\n")


Value counts for 'Label' column:
Label
Benign            663808
FTP-BruteForce    193354
SSH-Bruteforce    187589
Name: count, dtype: int64


In [7]:
# Reload the step-1 file (`input_path` is set by an earlier cell).
df = pd.read_csv(input_path)

# Parse the day/month/year text timestamps; values that do not match the
# format become NaT instead of raising (errors='coerce').
parsed_timestamps = pd.to_datetime(
    df['Timestamp'],
    format='%d/%m/%Y %H:%M:%S',
    errors='coerce',
)

# Whole seconds elapsed since the Unix epoch, via floor division by a 1-second Timedelta.
unix_epoch = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta('1s')
df['Timestamp'] = (parsed_timestamps - unix_epoch) // one_second

# Write the converted frame for the next stage.
output_path = r"D:\4th semester\SE\project\Datasets_final\step2.csv"
df.to_csv(output_path, index=False)

In [8]:
file_path = r"D:\4th semester\SE\project\Datasets_final\step2.csv"
df = pd.read_csv(file_path)

# Step 1: every object-dtype column other than 'Label' is forced to numeric;
# entries that cannot be parsed turn into NaN.
object_feature_cols = [
    name for name in df.columns
    if name != 'Label' and df[name].dtype == 'object'
]
for name in object_feature_cols:
    df[name] = pd.to_numeric(df[name], errors='coerce')

# Step 2: number of missing entries in each column.
count_NA = df.isna().sum()

# Report the per-column totals.
print("Missing Values Count (NaN) per Column:", count_NA, sep="\n")


Missing Values Count (NaN) per Column:
Dst Port             0
Protocol             0
Timestamp            0
Flow Duration        0
Tot Fwd Pkts         0
Tot Bwd Pkts         0
TotLen Fwd Pkts      0
TotLen Bwd Pkts      0
Fwd Pkt Len Max      0
Fwd Pkt Len Min      0
Fwd Pkt Len Mean     0
Fwd Pkt Len Std      0
Bwd Pkt Len Max      0
Bwd Pkt Len Min      0
Bwd Pkt Len Mean     0
Bwd Pkt Len Std      0
Flow Byts/s          0
Flow Pkts/s          0
Flow IAT Mean        0
Flow IAT Std         0
Flow IAT Max         0
Flow IAT Min         0
Fwd IAT Tot          0
Fwd IAT Mean         0
Fwd IAT Std          0
Fwd IAT Max          0
Fwd IAT Min          0
Bwd IAT Tot          0
Bwd IAT Mean         0
Bwd IAT Std          0
Bwd IAT Max          0
Bwd IAT Min          0
Fwd PSH Flags        0
Bwd PSH Flags        0
Fwd URG Flags        0
Bwd URG Flags        0
Fwd Header Len       0
Bwd Header Len       0
Fwd Pkts/s           0
Bwd Pkts/s           0
Pkt Len Min          0
Pkt Len Max       

In [9]:
from scipy import stats

# === Load the dataset ===
# NOTE: `file_path` is not assigned in this cell; it is whatever an earlier cell
# left in the kernel. In the recorded run order that is the step2.csv path set
# by the preceding cell.
df = pd.read_csv(file_path)

# === Function to filter outliers using Z-score ===
def filter_outliers_zscore(data, threshold):
    """Split a 2-D table into rows without and rows with extreme Z-scores.

    Z-scores come from ``scipy.stats.zscore(data)`` with its default settings.
    A row is treated as an outlier when the absolute Z-score of at least one
    of its entries is strictly greater than ``threshold``.

    Parameters
    ----------
    data : table indexable by a boolean row mask (the notebook passes
        single-column DataFrames).
    threshold : number compared against the absolute Z-scores.

    Returns
    -------
    tuple
        ``(kept_rows, outlier_rows)``, both selected from ``data``.
    """
    abs_z = np.abs(stats.zscore(data))
    exceeds = abs_z > threshold
    is_outlier_row = exceeds.any(axis=1)
    keep_row = ~is_outlier_row
    return data[keep_row], data[is_outlier_row]

# === Threshold ===
# Absolute Z-score above which a value is treated as an outlier.
threshold = 7

# === Run the Z-score filter on every feature column separately ===
columns = [col for col in df.columns if col != 'Label']
per_column = [filter_outliers_zscore(df[[name]], threshold) for name in columns]
filtered_cols = [result[0] for result in per_column]
removed_outliers = [result[1] for result in per_column]

# === Combine filtered and outlier dataframes ===
# Column-wise concat aligns on the row index, so a row missing from one
# column's frame shows up as NaN in that column of the combined frame.
df_filtered = pd.concat(filtered_cols, axis=1)
df_outliers = pd.concat(removed_outliers, axis=1)

# === Output Summary ===
print(f'\nOriginal Data Shape: {df.shape}')
print('Outlier removal summary:')
print(f'{len(df_outliers)} outlier rows would be removed\n')

# === Print previews ===
print('\nOriginal dataframe:')
print(df.head())

# Write the filtered feature values back; 'Label' is left untouched. Cells
# flagged as outliers become NaN here (rows are not dropped in this cell).
df.loc[:, columns] = df_filtered.loc[:, columns]

# === Save the filtered DataFrame to a new file ===
output_path = r"D:\4th semester\SE\project\Datasets_final\step_3.csv"
df.to_csv(output_path, index=False)

print(f"\n💾 Filtered dataset saved to: {output_path}")

# === Print final filtered dataframe ===
print('\nFiltered dataframe:')
print(df.head())

# === Print removed outliers ===
print('\nRemoved outliers:')
print(df_outliers.head())

# === Show which attack types had rows removed (if any) ===
# Labels of the rows whose index appears in the outlier frame.
in_outlier_rows = df.index.isin(df_outliers.index)
values_orig = df.loc[in_outlier_rows, 'Label']
print(f'\nAttack types removed in outliers:\n{values_orig.value_counts()}')



Original Data Shape: (1044751, 80)
Outlier removal summary:
33654 outlier rows would be removed


Original dataframe:
   Dst Port  Protocol   Timestamp  Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  \
0         0         0  1518597061      112641719             3             0   
1         0         0  1518597230      112641466             3             0   
2         0         0  1518597399      112638623             3             0   
3        22         6  1518597613        6453966            15            10   
4        22         6  1518597623        8804066            14            11   

   TotLen Fwd Pkts  TotLen Bwd Pkts  Fwd Pkt Len Max  Fwd Pkt Len Min  \
0                0                0                0                0   
1                0                0                0                0   
2                0                0                0                0   
3             1239             2273              744                0   
4             1143             2209

In [10]:
file_path = r"D:\4th semester\SE\project\Datasets_final\step_3.csv"

# === Load the step-3 output and keep only rows without any missing value ===
df = pd.read_csv(file_path).dropna()

# === Save the cleaned dataset ===
output_path = r"D:\4th semester\SE\project\Datasets_final\step_4.csv"
df.to_csv(output_path, index=False)

# === Report the resulting shape and destination ===
print(f"Cleaned dataset shape: {df.shape}")
print(f"Cleaned dataset saved at: {output_path}")

Cleaned dataset shape: (1011097, 80)
Cleaned dataset saved at: D:\4th semester\SE\project\Datasets_final\step_4.csv


In [12]:
file_path = r"D:\4th semester\SE\project\Datasets_final\step_4.csv"
df = pd.read_csv(file_path)

# === Every column except 'Label' takes part in the correlation analysis ===
columns = [col for col in df.columns if col != 'Label']

# === Absolute pairwise correlations between the feature columns ===
corr_matrix = df[columns].corr().abs()

# === Correlation above this value marks a feature as redundant ===
threshold = 0.99

# === Keep only the strict upper triangle so each pair is inspected once ===
# Entries on or below the diagonal become NaN and never exceed the threshold.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A column is dropped when it correlates above the threshold with any column
# that appears before it.
to_drop = [name for name in upper.columns if (upper[name] > threshold).any()]

# === Output which features will be dropped ===
print(f"The following {len(to_drop)} features will be dropped due to high correlation: {to_drop}")

# === Drop the highly correlated features ===
df = df.drop(columns=to_drop)

# === Save the modified dataset ===
output_path = r"D:\4th semester\SE\project\Datasets_final\step_5.csv"
df.to_csv(output_path, index=False)

# === Report destination, shape and the first few rows ===
print(f"Modified dataset saved at: {output_path}")
print(f"Modified dataset shape: {df.shape}")
print(df.head())


The following 16 features will be dropped due to high correlation: ['Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Min', 'Pkt Len Min', 'Pkt Len Max', 'SYN Flag Cnt', 'ECE Flag Cnt', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Subflow Fwd Pkts', 'Subflow Fwd Byts', 'Subflow Bwd Pkts', 'Subflow Bwd Byts', 'Idle Max', 'Idle Min']
Modified dataset saved at: D:\4th semester\SE\project\Datasets_final\step_5.csv
Modified dataset shape: (1011097, 64)
   Dst Port  Protocol     Timestamp  Flow Duration  Tot Fwd Pkts  \
0         0         0  1.518597e+09    112641719.0           3.0   
1         0         0  1.518597e+09    112641466.0           3.0   
2         0         0  1.518597e+09    112638623.0           3.0   
3        22         6  1.518598e+09      6453966.0          15.0   
4        22         6  1.518598e+09      8804066.0          14.0   

   Tot Bwd Pkts  TotLen Fwd Pkts  TotLen Bwd Pkts  Fwd Pkt Len Max  \
0           0.0              0.0              0.0              0.0 

In [13]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import joblib  # To save the scaler

# === Input: output of the correlation-pruning step ===
file_path = r"D:\4th semester\SE\project\Datasets_final\step_5.csv"
df = pd.read_csv(file_path)

# === Scale every column except 'Label' ===
columns = [col for col in df.columns if col != 'Label']

# === Fit the scaler on the feature columns and rescale them in one call ===
# NOTE(review): the scaler is fit on the complete dataset here; the train/test
# split only happens in a later cell. Confirm that is intended.
min_max_scaler = MinMaxScaler()
df[columns] = min_max_scaler.fit_transform(df[columns])

# === Persist the fitted scaler so the same scaling can be re-applied later ===
scaler_path = r"D:\4th semester\SE\project\Models\min_max_scaler.pkl"
joblib.dump(min_max_scaler, scaler_path)

# === Save the scaled dataset ===
output_path = r"D:\4th semester\SE\project\Datasets_final\step_6.csv"
df.to_csv(output_path, index=False)

# === Report destination, shape and the first few rows ===
print(f"Scaled dataset saved at: {output_path}")
print(f"Scaled dataset shape: {df.shape}")
print(df.head())


Scaled dataset saved at: D:\4th semester\SE\project\Datasets_final\step_6.csv
Scaled dataset shape: (1011097, 64)
   Dst Port  Protocol  Timestamp  Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  \
0  0.000000  0.000000   0.626427       0.938681      0.007576      0.000000   
1  0.000000  0.000000   0.630339       0.938679      0.007576      0.000000   
2  0.000000  0.000000   0.634251       0.938655      0.007576      0.000000   
3  0.000336  0.352941   0.639205       0.053783      0.053030      0.021739   
4  0.000336  0.352941   0.639436       0.073367      0.049242      0.023913   

   TotLen Fwd Pkts  TotLen Bwd Pkts  Fwd Pkt Len Max  Fwd Pkt Len Min  \
0         0.000000         0.000000         0.000000              0.0   
1         0.000000         0.000000         0.000000              0.0   
2         0.000000         0.000000         0.000000              0.0   
3         0.044745         0.004184         0.509589              0.0   
4         0.041278         0.004066         0.

In [14]:
file_path = r"D:\4th semester\SE\project\Datasets_final\step_6.csv"

# === Load the dataset ===
df = pd.read_csv(file_path)

# === Randomize the rows ===
# The shuffle decides which Benign rows end up in each balanced file, so it is
# seeded: without random_state every run wrote different CSVs. 42 is the same
# seed value the training cell passes to scikit-learn.
df = df.sample(frac=1, random_state=42)

# === Specify attack types to balance against "Benign" ===
attack_labels = ["FTP-BruteForce", "SSH-Bruteforce"]  # Add more attack types if needed

# Balanced frame per attack label, kept in memory alongside the CSVs on disk.
balanced_data = {}

# The Benign pool does not depend on the attack type, so select it once.
benign_pool = df[df["Label"] == "Benign"]

# === Loop through each attack type and balance the dataset ===
for attack in attack_labels:
    df_attack = df[df["Label"] == attack]

    # Take as many (shuffled) Benign rows as there are attack rows.
    df_benign = benign_pool.iloc[:len(df_attack)]
    if len(df_benign) < len(df_attack):
        # Slicing silently returns fewer rows when the pool is too small.
        print(
            f"Warning: only {len(df_benign)} Benign rows available for "
            f"{len(df_attack)} {attack} rows; this dataset is not balanced."
        )

    # Benign rows first, attack rows after them.
    balanced_df = pd.concat([df_benign, df_attack], axis=0)
    balanced_data[attack] = balanced_df

    # One CSV per attack type.
    output_path = f"D:\\4th semester\\SE\\project\\Datasets_final\\balanced_{attack}.csv"
    balanced_df.to_csv(output_path, index=False)
    print(f"Balanced dataset for {attack} saved at {output_path}")


Balanced dataset for FTP-BruteForce saved at D:\4th semester\SE\project\Datasets_final\balanced_FTP-BruteForce.csv
Balanced dataset for SSH-Bruteforce saved at D:\4th semester\SE\project\Datasets_final\balanced_SSH-Bruteforce.csv


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import os
import joblib  # For saving model and encoder

# === Balanced datasets to train on (one model per file) ===
file_paths = [
    r"D:\4th semester\SE\project\Datasets_final\balanced_FTP-BruteForce.csv",
    r"D:\4th semester\SE\project\Datasets_final\balanced_SSH-Bruteforce.csv"
]

# === Directory that receives the fitted models and label encoders ===
model_output_dir = r"D:\4th semester\SE\project\Models_final"
os.makedirs(model_output_dir, exist_ok=True)

# Caption / metric pairs, listed in the order they are printed.
train_report = (
    ("Training Accuracy: ", accuracy_score),
    ("Training Precision:", precision_score),
    ("Training Recall:   ", recall_score),
    ("Training F1 Score: ", f1_score),
    ("Training Confusion Matrix:\n", confusion_matrix),
)
test_report = (
    ("Test Accuracy:  ", accuracy_score),
    ("Test Precision: ", precision_score),
    ("Test Recall:    ", recall_score),
    ("Test F1 Score:  ", f1_score),
    ("Test Confusion Matrix:\n", confusion_matrix),
)

# === Training Loop ===
for file_path in file_paths:
    dataset_name = os.path.basename(file_path)
    df = pd.read_csv(file_path)

    # Features are every column except the target.
    X = df.drop(columns=["Label"])
    y = df["Label"]

    # LabelEncoder numbers classes in sorted order, so "Benign" becomes 0 and
    # the attack label 1 for both files listed above.
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # Store the encoder next to the model so predictions can be decoded later.
    encoder_name = dataset_name.replace(".csv", "_label_encoder.pkl")
    encoder_path = os.path.join(model_output_dir, encoder_name)
    joblib.dump(label_encoder, encoder_path)
    print(f"Label encoder saved at: {encoder_path}")

    # 80/20 split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42
    )

    # Fit the forest on the training portion only.
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Metrics on the data the model was fit on.
    y_train_pred = model.predict(X_train)
    print(f"\n==== Training on dataset: {dataset_name} ====\n")
    for caption, metric in train_report:
        print(caption, metric(y_train, y_train_pred))

    # Metrics on the held-out portion.
    # NOTE(review): the recorded run shows 1.0 for every metric, and the feature
    # matrix still contains the epoch-seconds 'Timestamp' column. Worth
    # confirming that column is not leaking the label.
    y_pred = model.predict(X_test)
    print(f"\n==== Evaluating model trained on: {dataset_name} ====\n")
    for caption, metric in test_report:
        print(caption, metric(y_test, y_pred))

    # Persist the fitted model.
    model_name = dataset_name.replace(".csv", "_model.pkl")
    model_path = os.path.join(model_output_dir, model_name)
    joblib.dump(model, model_path)
    print(f"Model saved at: {model_path}")


Label encoder saved at: D:\4th semester\SE\project\Models_final\balanced_FTP-BruteForce_label_encoder.pkl

==== Training on dataset: balanced_FTP-BruteForce.csv ====

Training Accuracy:  1.0
Training Precision: 1.0
Training Recall:    1.0
Training F1 Score:  1.0
Training Confusion Matrix:
 [[154641      0]
 [     0 154725]]

==== Evaluating model trained on: balanced_FTP-BruteForce.csv ====

Test Accuracy:   1.0
Test Precision:  1.0
Test Recall:     1.0
Test F1 Score:   1.0
Test Confusion Matrix:
 [[38713     0]
 [    0 38629]]
Model saved at: D:\4th semester\SE\project\Models_final\balanced_FTP-BruteForce_model.pkl
Label encoder saved at: D:\4th semester\SE\project\Models_final\balanced_SSH-Bruteforce_label_encoder.pkl

==== Training on dataset: balanced_SSH-Bruteforce.csv ====

Training Accuracy:  1.0
Training Precision: 1.0
Training Recall:    1.0
Training F1 Score:  1.0
Training Confusion Matrix:
 [[150051      0]
 [     0 150091]]

==== Evaluating model trained on: balanced_SSH-Br