In [123]:
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
import numpy as np
import hashlib
from typing import List, Tuple

# File paths and column definitions
hai_21_train_files = sorted(glob("../hai-21.03/train1.csv"))
hai_21_test_files = sorted(glob("../hai-21.03/test1.csv"))
hai_21_attack_cols = ['attack', 'attack_P1', 'attack_P2', 'attack_P3']

print(f"Found {len(hai_21_train_files)} training files: {hai_21_train_files}")  
print(f"Found {len(hai_21_test_files)} test files: {hai_21_test_files}")

Found 1 training files: ['../hai-21.03/train1.csv']
Found 1 test files: ['../hai-21.03/test1.csv']


In [124]:
class BloomFilter:
    """Bloom filter implementation for n-gram anomaly detection"""
    
    def __init__(self, size=1_000_000, k=5):
        self.size = size
        self.k = k
        self.bloom = np.zeros(size, dtype=bool)
        
    def _hashes(self, item: str) -> List[int]:
        """Generate k hash values for an item"""
        return [
            int(hashlib.sha1((str(seed) + item).encode()).hexdigest(), 16) % self.size
            for seed in range(self.k)
        ]
    
    def add(self, item: str):
        """Add item to bloom filter"""
        for h in self._hashes(item):
            self.bloom[h] = True
    
    def check(self, item: str) -> bool:
        """Check if item might be in the filter"""
        return all(self.bloom[h] for h in self._hashes(item))


In [125]:
def load_and_clean_data(train_files: List[str], test_files: List[str]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Step 1: Load & Clean Data
    - Read all train CSVs for HAI 21.03
    - Drop timestamp and attack labels
    - Remove rows where Attack == 1 (use only normal data for training)
    """
    print("\n=== Step 1: Loading & Cleaning Data ===")
    
    # Load training data
    train_dfs = []
    for file in train_files:
        print(f"Loading {file}...")
        df = pd.read_csv(file)
        
        # Remove attack rows (keep only normal data for training)
        if 'attack' in df.columns:
            normal_mask = df['attack'] == 0
            df = df[normal_mask]
        
        train_dfs.append(df)
    
    # Load test data
    test_dfs = []
    for file in test_files:
        print(f"Loading {file}...")
        df = pd.read_csv(file)
        test_dfs.append(df)
    
    # Combine all data
    train_df = pd.concat(train_dfs, ignore_index=True)
    test_df = pd.concat(test_dfs, ignore_index=True)
    
    # Drop timestamp and attack columns
    cols_to_drop = ['time'] + [col for col in hai_21_attack_cols if col in train_df.columns]
    train_df = train_df.drop(columns=cols_to_drop, errors='ignore')
    test_df = test_df.drop(columns=cols_to_drop, errors='ignore')
    
    print(f"Final training data shape: {train_df.shape}")
    print(f"Final test data shape: {test_df.shape}") 
    
    # Handle NaN values
    train_df = train_df.fillna(method='ffill').fillna(0)
    test_df = test_df.fillna(method='ffill').fillna(0)
    
    # train_df = train_df.sample(n=1000)
    # test_df = test_df.sample(n=1000)
    return train_df, test_df


In [126]:
def normalize_and_quantize(train_data: pd.DataFrame, test_data: pd.DataFrame, Q: int = 20) -> Tuple[np.ndarray, np.ndarray]:
    """
    Step 2: Normalize & Quantize
    - Use z-score normalization instead of min-max
    - Quantize into discrete bins (0 to Q-1)
    """
    print(f"\n=== Step 2: Normalizing & Quantizing (Q={Q}) ===")
    train_data = train_data.round()
    test_data = test_data.round()
    # Calculate z-score normalization parameters from training data only
    mean_vals = train_data.mean()
    std_vals = train_data.std()
    
    print(f"Data ranges before normalization:")
    print(f"  Train: min={train_data.min().min():.2f}, max={train_data.max().max():.2f}")
    print(f"  Test: min={test_data.min().min():.2f}, max={test_data.max().max():.2f}")
    
    # Z-score normalization
    train_normalized = (train_data - mean_vals) / std_vals
    test_normalized = (test_data - mean_vals) / std_vals
    
    # Handle any remaining NaN/inf values
    train_normalized = train_normalized.fillna(0)
    test_normalized = test_normalized.fillna(0)
    train_normalized = train_normalized.replace([np.inf, -np.inf], 0)
    test_normalized = test_normalized.replace([np.inf, -np.inf], 0)
    
    # Clip extreme values to reasonable range (e.g., -3 to 3 standard deviations)
    train_normalized = np.clip(train_normalized, -3, 3)
    test_normalized = np.clip(test_normalized, -3, 3)
    
    # Rescale to [0, 1] range for quantization
    # Map [-3, 3] to [0, 1]
    train_scaled = (train_normalized + 3) / 6
    test_scaled = (test_normalized + 3) / 6
    
    # Quantize into discrete bins [0, Q-1]
    train_quantized = np.floor(train_scaled * Q).astype(int)
    test_quantized = np.floor(test_scaled * Q).astype(int)
    
    # Ensure values are in valid range
    train_quantized = np.clip(train_quantized, 0, Q-1)
    test_quantized = np.clip(test_quantized, 0, Q-1)
    
    print(f"Quantized ranges:")
    print(f"  Train: min={train_quantized.min()}, max={train_quantized.max()}")
    print(f"  Test: min={test_quantized.min()}, max={test_quantized.max()}")
    
    return train_quantized, test_quantized


In [127]:
def build_state_strings(quantized_data: np.ndarray) -> List[str]:
    """
    Step 3: Build State String per Time Step
    Convert each time row into a state string
    """
    print(f"\n=== Step 3: Building State Strings ===")
    state_strings = []
    for i, row in enumerate(quantized_data):
        state = "_".join(map(str, row))
        state_strings.append(state)
    
    #print(f"Generated {len(state_strings)} state strings")
    return state_strings

In [128]:
def generate_ngrams(state_strings: List[str], n: int = 3) -> List[str]:
    """
    Generate N-grams
    Slide a window of size n over the sequence of state strings
    """
    print(f"\n=== Step 4: Generating {n}-grams ===")
    
    ngrams = []
    for i in range(len(state_strings) - n + 1):
        sequence = state_strings[i:i+n]
        ngram = "→".join(sequence)
        ngrams.append(ngram)
        
        if i < 3:  # Show first few examples
            print(f"  N-gram {i}: {ngram[:100]}...")
    
    print(f"Generated {len(ngrams)} n-grams")
    return ngrams


In [129]:
def train_bloom_filter(ngrams: List[str], M: int = 1_000_000, k: int = 5) -> BloomFilter:
    """
    Hash and Store in Bloom Filter
    Train the bloom filter with normal n-grams
    """
    print(f"\n=== Training Bloom Filter (M={M}, k={k}) ===")
    
    bloom = BloomFilter(size=M, k=k)
    
    for i, ngram in enumerate(ngrams):
        bloom.add(ngram)
        
        if i % 10000 == 0:
            print(f"  Added {i+1}/{len(ngrams)} n-grams")
    
    print(f"Bloom filter training completed!")
    return bloom


In [130]:
def detect_anomalies(test_ngrams: List[str], bloom_filter: BloomFilter) -> Tuple[List[bool], float]:
    """
    Detect anomalies using the trained bloom filter
    """
    print(f"\n=== Anomaly Detection ===")
    
    anomaly_scores = []
    for ngram in test_ngrams:
        is_normal = bloom_filter.check(ngram)
        anomaly_scores.append(not is_normal)  # Anomaly if not in bloom filter
    
    anomaly_rate = sum(anomaly_scores) / len(anomaly_scores)
    print(f"Anomaly rate: {anomaly_rate:.4f} ({sum(anomaly_scores)}/{len(anomaly_scores)})")
    
    return anomaly_scores, anomaly_rate


MAIN

In [131]:
# Step 1: Load and clean data
train_df, test_df = load_and_clean_data(hai_21_train_files, hai_21_test_files)


=== Step 1: Loading & Cleaning Data ===
Loading ../hai-21.03/train1.csv...
Loading ../hai-21.03/test1.csv...
Final training data shape: (216001, 79)
Final test data shape: (43201, 79)


  train_df = train_df.fillna(method='ffill').fillna(0)
  test_df = test_df.fillna(method='ffill').fillna(0)


In [133]:
train_quantized, test_quantized = normalize_and_quantize(train_df, test_df, Q=10)


=== Step 2: Normalizing & Quantizing (Q=10) ===
Data ranges before normalization:
  Train: min=-288.00, max=540833.00
  Test: min=-288.00, max=540833.00
Quantized ranges:
  Train: min=P1_B2004      5
P1_B2016      4
P1_B3004      1
P1_B3005      0
P1_B4002      3
             ..
P4_ST_LD      0
P4_ST_PO      0
P4_ST_PS      4
P4_ST_PT01    0
P4_ST_TT01    0
Length: 79, dtype: int64, max=P1_B2004      5
P1_B2016      8
P1_B3004      9
P1_B3005      6
P1_B4002      8
             ..
P4_ST_LD      9
P4_ST_PO      9
P4_ST_PS      9
P4_ST_PT01    9
P4_ST_TT01    6
Length: 79, dtype: int64
  Test: min=P1_B2004      5
P1_B2016      4
P1_B3004      4
P1_B3005      2
P1_B4002      3
             ..
P4_ST_LD      0
P4_ST_PO      0
P4_ST_PS      4
P4_ST_PT01    0
P4_ST_TT01    0
Length: 79, dtype: int64, max=P1_B2004      5
P1_B2016      8
P1_B3004      5
P1_B3005      6
P1_B4002      6
             ..
P4_ST_LD      9
P4_ST_PO      9
P4_ST_PS      9
P4_ST_PT01    9
P4_ST_TT01    6
Length: 79, dt

In [141]:
train_quantized.head()

Unnamed: 0,P1_B2004,P1_B2016,P1_B3004,P1_B3005,P1_B4002,P1_B4005,P1_B400B,P1_B4022,P1_FCV01D,P1_FCV01Z,...,P4_HT_PO,P4_HT_PS,P4_LD,P4_ST_FD,P4_ST_GOV,P4_ST_LD,P4_ST_PO,P4_ST_PS,P4_ST_PT01,P4_ST_TT01
0,5,4,4,3,8,6,6,7,6,6,...,3,4,3,5,3,3,3,4,5,5
1,5,4,4,3,8,6,6,7,6,6,...,3,4,3,5,3,3,3,4,5,5
2,5,4,4,3,8,6,6,7,6,6,...,3,4,3,5,3,3,3,4,5,5
3,5,4,4,3,8,6,6,7,6,6,...,3,4,3,5,3,3,3,4,5,5
4,5,4,4,3,8,6,6,7,6,6,...,3,4,3,5,3,3,3,4,5,5


In [142]:
train_states = build_state_strings(train_quantized.values) 
test_states = build_state_strings(test_quantized.values)


=== Step 3: Building State Strings ===

=== Step 3: Building State Strings ===


In [143]:
train_states[0]

'5_4_4_3_8_6_6_7_6_6_3_3_3_3_3_3_6_6_2_3_2_2_4_2_2_5_5_4_4_5_5_5_5_5_5_5_3_4_5_5_5_2_5_4_5_5_5_5_5_5_5_5_5_5_5_5_6_6_5_4_8_7_3_5_5_5_5_5_3_3_4_3_5_3_3_3_4_5_5'

In [144]:
n = 2  # n-gram order
train_ngrams = generate_ngrams(train_states, n)
test_ngrams = generate_ngrams(test_states, n)


=== Step 4: Generating 2-grams ===
  N-gram 0: 5_4_4_3_8_6_6_7_6_6_3_3_3_3_3_3_6_6_2_3_2_2_4_2_2_5_5_4_4_5_5_5_5_5_5_5_3_4_5_5_5_2_5_4_5_5_5_5_5_5_...
  N-gram 1: 5_4_4_3_8_6_6_7_6_6_3_3_3_3_3_3_6_6_2_3_2_2_4_2_2_5_5_4_4_5_5_5_5_5_5_5_3_4_5_5_5_3_5_4_5_5_5_5_5_5_...
  N-gram 2: 5_4_4_3_8_6_6_7_6_6_3_3_3_3_3_3_6_6_2_3_2_2_4_2_2_5_5_4_4_5_5_5_5_5_5_5_3_4_5_5_5_6_5_4_5_5_5_5_5_5_...
Generated 216000 n-grams

=== Step 4: Generating 2-grams ===
  N-gram 0: 5_8_5_2_6_6_6_7_6_6_3_3_2_2_3_4_6_6_2_2_3_4_5_3_2_5_5_4_4_5_5_5_5_5_5_5_3_3_5_5_5_5_5_7_5_5_5_5_9_9_...
  N-gram 1: 5_8_5_2_6_6_6_7_6_6_3_3_2_2_3_3_6_6_2_2_3_4_5_3_2_5_5_4_4_5_5_5_5_5_5_5_3_3_5_5_5_5_5_7_5_5_5_5_9_9_...
  N-gram 2: 5_8_5_2_6_6_6_7_6_6_3_3_2_2_3_3_6_6_2_2_3_4_5_3_2_5_5_4_4_5_5_5_5_5_5_5_3_3_5_5_5_3_5_6_5_5_5_5_7_7_...
Generated 43200 n-grams


In [145]:
bloom_filter = train_bloom_filter(train_ngrams, M=1_000_000, k=5)


=== Training Bloom Filter (M=1000000, k=5) ===
  Added 1/216000 n-grams
  Added 10001/216000 n-grams
  Added 20001/216000 n-grams
  Added 30001/216000 n-grams
  Added 40001/216000 n-grams
  Added 50001/216000 n-grams
  Added 60001/216000 n-grams
  Added 70001/216000 n-grams
  Added 80001/216000 n-grams
  Added 90001/216000 n-grams
  Added 100001/216000 n-grams
  Added 110001/216000 n-grams
  Added 120001/216000 n-grams
  Added 130001/216000 n-grams
  Added 140001/216000 n-grams
  Added 150001/216000 n-grams
  Added 160001/216000 n-grams
  Added 170001/216000 n-grams
  Added 180001/216000 n-grams
  Added 190001/216000 n-grams
  Added 200001/216000 n-grams
  Added 210001/216000 n-grams
Bloom filter training completed!


In [146]:
anomalies, anomaly_rate = detect_anomalies(test_ngrams, bloom_filter)


=== Anomaly Detection ===
Anomaly rate: 0.8742 (37764/43200)


In [139]:
print(f"\n=== Summary ===")
print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"N-gram order: {n}")
print(f"Quantization levels: 20")
print(f"Bloom filter size: 1,000,000 bits")
print(f"Hash functions: 5")
print(f"Anomaly rate: {anomaly_rate:.4f}")


=== Summary ===
Training samples: 216001
Test samples: 43201
N-gram order: 2
Quantization levels: 20
Bloom filter size: 1,000,000 bits
Hash functions: 5
Anomaly rate: 0.8742
