In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [None]:
# Load the dataset from a CSV file resolved relative to the notebook's
# working directory.
df = pd.read_csv("All-Processed-Reduced-Cleaned.csv")

# Report (rows, columns) as loaded, then preview the first rows as the cell output.
print("Initial shape:", df.shape)
df.head()

In [None]:
# Build a one-row-per-column profile of `df` (dtype, cardinality, nulls and
# value statistics) so suspicious columns can be spotted before cleaning.
summary = []

for col in df.columns:
    col_data = df[col]
    col_info = {
        "column": col,
        "dtype": col_data.dtype,
        "n_unique": col_data.nunique(),  # NaN is not counted as a distinct value
        "nulls": col_data.isnull().sum()
    }

    if pd.api.types.is_numeric_dtype(col_data):
        # Numeric columns: range, mean, and counts of the two kinds of
        # suspicious value looked for here (+/-inf and the literal -1).
        col_info.update({
            "min": col_data.min(),
            "max": col_data.max(),
            "mean": col_data.mean(),
            "inf_count": np.isinf(col_data).sum(),
            "-1_count": (col_data == -1).sum()
        })
    else:
        # Non-numeric columns: most frequent value and how often it occurs.
        # Count once and reuse the result for both fields.
        value_freqs = col_data.value_counts()
        if value_freqs.empty:
            # value_counts() skips NaN, so an all-null column yields an empty
            # Series and idxmax() would raise ValueError; record "no top
            # value" instead of aborting the whole summary.
            col_info.update({"top_value": np.nan, "top_freq": 0})
        else:
            col_info.update({
                "top_value": value_freqs.idxmax(),
                "top_freq": value_freqs.max()
            })

    summary.append(col_info)

# Keys missing for a given column kind (e.g. "min" for text columns) become NaN.
summary_df = pd.DataFrame(summary)

# Display the summary nicely.
# NOTE: set_option is global, so every later cell in this kernel session will
# also print all rows.
pd.set_option('display.max_rows', None)  # show all rows
print(summary_df)


In [None]:
# --- Step 1: Clean duration-related features ---
# Timing columns that are forced to numeric and clamped to be non-negative.
duration_cols = [
    "Flow Duration", "Flow IAT Mean", "Flow IAT Std", "Flow IAT Max", "Flow IAT Min",
    "Fwd IAT Tot", "Fwd IAT Mean", "Fwd IAT Std", "Fwd IAT Max", "Fwd IAT Min",
    "Bwd IAT Tot", "Bwd IAT Mean", "Bwd IAT Std", "Bwd IAT Max", "Bwd IAT Min"
]
for col in duration_cols:
    if col in df.columns:
        # Unparseable entries become NaN (mean-filled in Step 2); negative
        # values are raised to 0.
        df[col] = pd.to_numeric(df[col], errors='coerce').clip(lower=0)
# NOTE(review): because clipping runs before Step 2, a -1 in these columns
# ends up as 0 rather than being treated as missing - confirm this is intended.

# --- Step 2: Replace placeholders and invalid values ---
# Mark -1 and +/-inf as missing everywhere in the frame.
df.replace([-1, np.inf, -np.inf], np.nan, inplace=True)

# Fill remaining NaNs in numeric columns with that column's mean. Non-numeric
# columns are left as they are; all-NaN numeric columns stay NaN and are
# removed as constant in Step 3.
# NOTE(review): the means are computed over every row; if a train/test split
# follows, the fill values are influenced by the test rows - confirm acceptable.
df.fillna(df.mean(numeric_only=True), inplace=True)

# --- Step 3: Drop constant features ---
# Only columns with a single distinct value (NaN counted as a value) are
# removed; nothing here detects "nearly constant" columns.
nunique = df.nunique(dropna=False)
constant_cols = nunique[nunique <= 1].index
df.drop(columns=constant_cols, inplace=True)

# --- Step 4: Ensure only numeric columns for model input ---
non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns
# Keep a copy of everything about to be removed so it is not lost: a label
# stored as text would be among these columns. `non_numeric_df` shares the
# row index with `df` and is empty when nothing is dropped.
non_numeric_df = df[non_numeric_cols].copy()
if len(non_numeric_cols) > 0:
    print(f"⚠️ Found non-numeric columns: {list(non_numeric_cols)}")
    df.drop(columns=non_numeric_cols, inplace=True)

print(f"✅ Cleaning completed. Dropped {len(constant_cols)} constant columns.")
