In [21]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
import joblib
import matplotlib.pyplot as plt

# Step 1: Read both CSV captures and tag every row with its class.
#   label = 0 -> rows read from the "normal" file
#   label = 1 -> rows read from the "anormal" file
# low_memory=False makes pandas read that file in one pass instead of in
# chunks, which avoids mixed-dtype inference on its columns.
df_normal = pd.read_csv("final-normal-data-set.csv").assign(label=0)
df_anormal = pd.read_csv("final-anormal-data-set.csv", low_memory=False).assign(label=1)

In [22]:
# Stack the two labelled frames row-wise; ignore_index=True discards the
# per-file row indices and renumbers the result 0..n-1.
df_all = pd.concat(
    objs=(df_normal, df_anormal),
    axis=0,
    ignore_index=True,
)

In [23]:
# Step 2: Keep only the columns that hold at least one value different from 0.
# NaN and non-numeric cells compare as "not equal to 0", so a column is
# removed only when every single cell in it equals 0.
nonzero_column_mask = df_all.ne(0).any(axis=0)
df = df_all.loc[:, nonzero_column_mask]

In [24]:

# Columns the notebook treats as useless for modelling: device / interface /
# filesystem identifiers, host and OS description fields, and the timestamp.
useless_cols = [
    'diskio_sda1_disk_name',
    'diskio_sda1_key',
    'diskio_sda_disk_name',
    'diskio_sda_key',
    'fs_/_device_name',
    'fs_/_fs_type',
    'fs_/_key',
    'fs_/_mnt_point',
    'network_lo_interface_name',
    'network_lo_key',
    'percpu_0_key',
    'system_hostname',
    'system_hr_name',
    'system_linux_distro',
    'system_os_name',
    'system_os_version',
    'system_platform',
    'timestamp',
]

# errors="ignore" skips any listed name that is not a column of df (for
# example one already removed by the all-zero filter), so nothing is raised.
df = df.drop(columns=useless_cols, errors="ignore")


In [None]:
# Columns the notebook classifies as Linux-only. They are removed before the
# frame is written out under the "windows_features" file name.
linux_only = [
    # CPU time breakdown
    'cpu_guest',
    'cpu_guest_nice',
    'cpu_iowait',
    'cpu_irq',
    'cpu_nice',
    'cpu_softirq',
    'cpu_steal',
    # load averages
    'load_cpucore',
    'load_min1',
    'load_min5',
    'load_min15',
    # memory / swap counters
    'mem_buffers',
    'mem_shared',
    'memswap_sin',
    'memswap_sout',
    # "lo" network interface counters
    'network_lo_cumulative_cx',
    'network_lo_cumulative_rx',
    'network_lo_cumulative_tx',
    'network_lo_cx',
    'network_lo_rx',
    'network_lo_time_since_update',
    'network_lo_tx',
    # per-CPU (cpu 0) breakdown
    'percpu_0_iowait',
    'percpu_0_nice',
    'percpu_0_softirq',
]

# errors="ignore" silently skips names that are not present in df.
df = df.drop(columns=linux_only, errors="ignore")

# Persist the reduced frame; index=False keeps the row index out of the CSV.
df.to_csv("windows_features_dataset.csv", index=False)
print("✅ Saved quickly as 'windows_features_dataset.csv'")

In [27]:
# Report which columns survived the filtering cells above.
# Kept:    CPU, disk I/O, filesystem, memory, swap, per-CPU and process-count
#          metrics, plus the 'label' column.
# Dropped: identifier / OS-description columns, the timestamp, and the
#          load-average and "lo" network columns.
# Note: both the list and the count below include the 'label' column.
feature_names = df.columns.to_list()
print("Feature names:", feature_names)

# One entry per remaining column.
num_features = len(feature_names)
print("Number of features:", num_features)

Feature names: ['cpu_idle', 'cpu_system', 'cpu_total', 'cpu_user', 'diskio_sda1_read_bytes', 'diskio_sda1_time_since_update', 'diskio_sda1_write_bytes', 'diskio_sda_read_bytes', 'diskio_sda_time_since_update', 'diskio_sda_write_bytes', 'fs_/_free', 'fs_/_percent', 'fs_/_size', 'fs_/_used', 'mem_active', 'mem_available', 'mem_cached', 'mem_free', 'mem_inactive', 'mem_percent', 'mem_total', 'mem_used', 'memswap_free', 'memswap_percent', 'memswap_total', 'memswap_used', 'percpu_0_idle', 'percpu_0_system', 'percpu_0_total', 'percpu_0_user', 'processcount_running', 'processcount_sleeping', 'processcount_thread', 'processcount_total', 'label']
Number of features: 35
