In [2]:
from utils import load_and_clean_data
from glob import glob


# Resolve the train / test / label CSVs of the HAI-END 23.05 release.
# Each result is sorted so the file order is deterministic.
train_files, test_files, label_files = (
    sorted(glob("../haiend-23.05/end-train1.csv")),
    sorted(glob("../haiend-23.05/end-test1.csv")),
    sorted(glob("../haiend-23.05/label-test1.csv")),
)

# Merge train and test data into one frame; attack labels come from the
# separate label files rather than from columns inside the data files.
haiEnd_df = load_and_clean_data(
    train_files,
    test_files,
    attack_cols=None,
    label_files=label_files,
)



=== Loading & Cleaning Data ===
Loading ../haiend-23.05/end-train1.csv...
  Original shape: (280800, 226)
Loading ../haiend-23.05/end-test1.csv...
  Original shape: (54000, 226)
Loading labels from ../haiend-23.05/label-test1.csv...
  Labels merged. Test shape: (54000, 227)
  Attack rows in labels: 1821
Final training data shape: (280800, 225)
Final test data shape: (54000, 226)
Total attack rows in merged dataset: 1821


In [3]:

# Feature matrix: every column except the attack label.
X = haiEnd_df.drop(columns='label', errors='ignore')

# Target vector: the attack label (0 or 1).
y = haiEnd_df['label']

In [4]:
import numpy as np

def make_kfold_indices(n_samples, k=5, seed=42):
    """
    Shuffle ``0 .. n_samples-1`` and cut the permutation into ``k`` folds.

    Parameters
    ----------
    n_samples : int
        Number of items to distribute.
    k : int
        Number of folds.
    seed : int
        Seed for NumPy's *global* random generator. The global state is
        re-seeded as a side effect of calling this function.

    Returns
    -------
    list of numpy.ndarray
        ``k`` disjoint index arrays whose sizes differ by at most one; the
        first ``n_samples % k`` folds hold the extra element.
    """
    # Re-seed the global generator so the permutation is reproducible.
    np.random.seed(seed)
    shuffled = np.arange(n_samples)
    np.random.shuffle(shuffled)

    # Every fold gets the floor share; the leading folds absorb the remainder.
    sizes = np.full(k, n_samples // k, dtype=int)
    sizes[: n_samples % k] += 1

    # Turn the sizes into [start, stop) offsets and slice the permutation.
    stops = np.cumsum(sizes)
    starts = stops - sizes
    return [shuffled[lo:hi] for lo, hi in zip(starts, stops)]

In [5]:
def scenario_1_split(X, y, k=5, seed=42, balance_attacks=False):
    """
    Scenario 1:
      Train on normal data only.
      Test on normal (current fold) + attack samples.

    Parameters
    ----------
    X : any
        Not used; accepted so all scenario functions share the ``(X, y)``
        call signature.
    y : array-like
        Labels, where 0 marks a normal row and 1 marks an attack row.
    k : int
        Number of folds the normal rows are divided into.
    seed : int
        Seed for the fold shuffle and for the optional attack subsample.
    balance_attacks : bool
        If False, every attack row is added to each test fold. If True, a
        random subset of attacks is used, sized to match the normal test fold
        and capped at the number of attack rows available.

    Yields
    ------
    (fold_idx, train_idx, test_idx)
        ``fold_idx`` is 0-based; the index arrays are positions into ``y``.
    """
    labels = np.asarray(y)
    normal_idx = np.flatnonzero(labels == 0)
    attack_idx = np.flatnonzero(labels == 1)
    folds = make_kfold_indices(len(normal_idx), k, seed)

    # Dedicated generator: this function is a lazy generator, so draws from
    # the global RNG would depend on whatever the caller runs between folds.
    rng = np.random.RandomState(seed)

    for fold_idx in range(k):
        test_normal_idx = normal_idx[folds[fold_idx]]
        train_normal_idx = np.setdiff1d(normal_idx, test_normal_idx)

        if balance_attacks:
            # Sampling without replacement cannot return more items than the
            # population holds, so cap the request at the attack count.
            n_attack = min(len(test_normal_idx), len(attack_idx))
            if n_attack:
                attack_sample_idx = rng.choice(attack_idx, n_attack, replace=False)
            else:
                attack_sample_idx = attack_idx[:0]
        else:
            attack_sample_idx = attack_idx

        test_idx = np.concatenate([test_normal_idx, attack_sample_idx])
        train_idx = train_normal_idx

        yield fold_idx, train_idx, test_idx

In [6]:
def scenario_2_split(X, y, n_clusters=3, k=5, seed=42, balance_attacks=False):
    """
    Scenario 2:
      Train on normal + (n-1) attack clusters.
      Test on normal (current fold) + attack samples.

    The "clusters" are a random partition: attack rows are shuffled and cut
    into ``n_clusters`` nearly equal groups.

    Parameters
    ----------
    X : any
        Not used; accepted so all scenario functions share the ``(X, y)``
        call signature.
    y : array-like
        Labels, where 0 marks a normal row and 1 marks an attack row.
    n_clusters : int
        Number of groups the attack rows are partitioned into.
    k : int
        Number of folds the normal rows are divided into.
    seed : int
        Seed for the attack shuffle, the fold shuffle and the optional
        attack subsample.
    balance_attacks : bool
        If False, every attack row is added to each test set. If True, a
        random subset of attacks is used, sized to match the normal test fold
        and capped at the number of attack rows available.

    Yields
    ------
    (fold_idx, held_out, train_idx, test_idx)
        One tuple per (fold, held-out cluster) pair; the index arrays are
        positions into ``y``.
    """
    labels = np.asarray(y)
    normal_idx = np.flatnonzero(labels == 0)
    attack_idx = np.flatnonzero(labels == 1)

    # Dedicated generator: gives the same attack permutation as seeding the
    # global RNG with `seed`, without depending on global state between yields.
    rng = np.random.RandomState(seed)
    rng.shuffle(attack_idx)
    attack_clusters = np.array_split(attack_idx, n_clusters)
    folds = make_kfold_indices(len(normal_idx), k, seed)

    for fold_idx in range(k):
        test_normal_idx = normal_idx[folds[fold_idx]]
        train_normal_idx = np.setdiff1d(normal_idx, test_normal_idx)

        # Loop through each held-out cluster (you can also randomize one per fold)
        for held_out in range(n_clusters):
            # Combine all clusters except the held-out one for training.
            # With a single cluster nothing is left, so train on no attacks.
            kept = [c for i, c in enumerate(attack_clusters) if i != held_out]
            train_attack_idx = np.concatenate(kept) if kept else attack_idx[:0]

            # NOTE(review): the test pool is every attack row, including the
            # clusters used for training above. This matches the docstring,
            # but confirm the train/test overlap is intended.
            test_attack_idx = attack_idx

            if balance_attacks:
                # Sampling without replacement cannot return more items than
                # the population holds, so cap the request.
                n_attack = min(len(test_normal_idx), len(attack_idx))
                if n_attack:
                    test_attack_idx = rng.choice(attack_idx, n_attack, replace=False)
                else:
                    test_attack_idx = attack_idx[:0]

            train_idx = np.concatenate([train_normal_idx, train_attack_idx])
            test_idx = np.concatenate([test_normal_idx, test_attack_idx])

            yield (fold_idx, held_out, train_idx, test_idx)

In [11]:
def run_kfold_model(X, y, model, scenario_fn):
    """
    Fit and score ``model`` on every split produced by ``scenario_fn``.

    Parameters
    ----------
    X, y : objects supporting positional ``.iloc`` indexing
        Features and labels; they are sliced with the index arrays that
        ``scenario_fn`` yields.
    model : object with ``fit(X, y)`` and ``predict(X)``
        The same instance is re-fitted on each split.
    scenario_fn : callable
        Called as ``scenario_fn(X, y)``. It must yield tuples that start with
        a 0-based fold number and end with ``train_idx, test_idx``. Items in
        between (such as the held-out cluster id from ``scenario_2_split``)
        are only echoed in the printed line.

    Prints one line per split with accuracy and train/test sizes; returns None.
    """
    for split in scenario_fn(X, y):
        # Accept both (fold, train, test) and (fold, held_out, train, test).
        fold_idx, *extras, train_idx, test_idx = split

        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Compare by position, not by pandas index alignment.
        acc = np.mean(np.asarray(y_pred) == np.asarray(y_test))

        tag = f"Fold {fold_idx+1}"
        if extras:
            tag += f" (held-out {', '.join(str(e) for e in extras)})"
        print(f"{tag}: Accuracy={acc:.4f}, Train={len(train_idx)}, Test={len(test_idx)}")


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import OneClassSVM



In [None]:
# Random-forest classifier: 100 trees, no depth limit (trees grow fully),
# fixed random_state so repeated runs build the same forest.
model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)


In [None]:
# model = KNeighborsClassifier(
#     n_neighbors=5,     # typical default
#     weights='distance' # or 'uniform'
# )

In [None]:

# Evaluate the current model under scenario 1 (train on normal rows only).
run_kfold_model(X, y, model=model, scenario_fn=scenario_1_split)