In [1]:
from micromlgen import port
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import joblib
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, label_binarize
from sklearn.model_selection import train_test_split

In [2]:
# Set the seed for reproducibility
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# BATCHSIZE = 128  # Not used in Random Forest
CLASSES = 13

# INPUT_SIZE = 83  # Not used in Random Forest, but retained for consistency

In [3]:
def load_and_preprocess_data(filepath):
    """
    Loads data from a CSV file, replaces placeholders, and separates features from target.
    
    Args:
        filepath (str): Path to the CSV file.
        
    Returns:
        data (pd.DataFrame): Feature dataframe.
        target (pd.Series): Target labels.
        feature_names (list): List of feature names.
    """
    data = pd.read_csv(filepath)
    
    # Replace '-' with 'None' in categorical columns
    data['service'] = data['service'].replace('-', 'None')
    data['proto'] = data['proto'].replace('-', 'None')
    
    # Separate features and target
    feature_names = data.drop(columns=['Attack_type']).columns.tolist()
    target = data['Attack_type']
    data = data.drop(columns=['Attack_type'])
    
    return data, target, feature_names

# Load the dataset
data, target, feature_names = load_and_preprocess_data('./FCNN/RT_IOT2022_new.csv')



def encode_targets(target):
    """
    Encodes categorical target labels into numerical codes.
    
    Args:
        target (pd.Series): Categorical target labels.
        
    Returns:
        target_encoded (np.ndarray): Encoded target labels.
        class_labels (list): List of class names.
        target_encoder (LabelEncoder): Fitted LabelEncoder instance.
    """
    target_encoder = LabelEncoder()
    target_encoded = target_encoder.fit_transform(target)
    class_labels = target_encoder.classes_
    return target_encoded, class_labels, target_encoder


# In[24]:


# Encode target labels
Y_encoded, class_labels, target_encoder = encode_targets(target)


# In[25]:


print(f"Number of unique classes: {len(class_labels)}")
print(f"Class labels: {class_labels}")


# In[26]:


# Split the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(
    data, Y_encoded, test_size=0.2, random_state=SEED, stratify=Y_encoded
)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# Clean up to save memory
del data
del target

Number of unique classes: 13
Class labels: ['ARP_poisioning' 'Alexa' 'DDOS_Slowloris' 'DOS_SYN_Hping' 'MQTT_Publish'
 'Metasploit_Brute_Force_SSH' 'NMAP_FIN_SCAN' 'NMAP_OS_DETECTION'
 'NMAP_TCP_scan' 'NMAP_UDP_SCAN' 'NMAP_XMAS_TREE_SCAN' 'Thing_Speak'
 'Wipro_bulb']
Training set size: (167967, 83)
Test set size: (41992, 83)


In [4]:
# Load your best model (pipeline that includes preprocessing and classifier)
best_model_path = "./ML Models/best_svm_rbf_only_model_with_preproc__100.joblib"
best_model = joblib.load(best_model_path)


preprocessor, classifier = joblib.load(best_model_path)   # ← this file is a tuple

# 2) Wrap into a single Pipeline
pipe = Pipeline([("preprocessor", preprocessor), ("clf", classifier)])


# Transform the test data
X_test_transformed = preprocessor.transform(X_test)


# Print the shape before and after preprocessing
print(f"Before preprocessing: X_test shape: {X_test.shape}")
print(f"After preprocessing: X_test_transformed shape: {X_test_transformed.shape}")  

Before preprocessing: X_test shape: (41992, 83)
After preprocessing: X_test_transformed shape: (41992, 94)


In [5]:
print(type(preprocessor), type(classifier))
n_features = X_test_transformed.shape[1]
n_sv       = getattr(classifier, "support_vectors_", np.empty((0,0))).shape[0]
n_classes  = len(classifier.classes_)

<class 'sklearn.compose._column_transformer.ColumnTransformer'> <class 'sklearn.svm._classes.SVC'>


In [6]:
# crude flash footprint estimate (floats only; ignores code/struct overhead)
approx_bytes = n_sv * n_features * 4 + n_sv * 4 + n_classes * 4
print(f"Features={n_features}, SVs={n_sv}, approx float-bytes={approx_bytes:,}")

Features=94, SVs=1980, approx float-bytes=752,452


In [7]:
# Make a clean index→label map (strings!)
labels   = [str(l) for l in classifier.classes_.tolist()]
classmap = {i: lbl for i, lbl in enumerate(labels)}

c_code = port(classifier, classmap=classmap)   # emits predict(float*) API
with open("./ML Models/svm_model.h", "w") as f:
    f.write(c_code)

print("Wrote svm_model.h with classmap:", classmap)

Wrote svm_model.h with classmap: {0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8', 9: '9', 10: '10', 11: '11', 12: '12'}


In [8]:
# 1) Sanity: predict on a few samples matches sklearn
import numpy as np
from micromlgen import port

# Use your existing transform
X0 = X_test[:5]
X0_t = preprocessor.transform(X0)
y0 = classifier.predict(X0_t)

# Build a tiny pure-Python adapter from the generated header (optional):
# micromlgen also supports a "port" to Python for quick parity checks if needed.

print("Sklearn preds:", y0)
print("Classes order used in classmap:", labels)

Sklearn preds: [3 1 3 3 0]
Classes order used in classmap: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']


In [10]:
import numpy as np

# Pick a random sample index
idx = np.random.randint(len(X_test))

# Preprocess that sample
x0 = preprocessor.transform(X_test[idx:idx+1]).astype(np.float32).ravel()
D  = x0.size
print(f"Random sample index: {idx}")
print("INPUT_DIM =", D)

# Predict its class
y_pred = classifier.predict(x0.reshape(1, -1))[0]
print(f"Predicted class: {y_pred}")

# Pretty-print as a C float array with '0.0f' and '1.0f' for integers
def to_c_array(a, var='Xf', name_dim='INPUT_DIM', per_line=8):
    def fmt(v):
        # Format as float literal with 'f' suffix, forcing decimal point
        if v == 0:
            return "0.0f"
        elif v == 1:
            return "1.0f"
        else:
            return f"{v:.8g}f"
    vals = [fmt(v) for v in a]
    lines = [", ".join(vals[i:i+per_line]) for i in range(0, len(vals), per_line)]
    return f"static const float {var}[{name_dim}] = {{\n  " + ",\n  ".join(lines) + "\n};"

print(to_c_array(x0))


Random sample index: 15795
INPUT_DIM = 94
Predicted class: 1
static const float Xf[INPUT_DIM] = {
  0.30141717f, -0.080645718f, 0.020115202f, 1.1857766f, 1.2068995f, 1.3959353f, 0.34605837f, -0.6197083f,
  -0.61959982f, -0.61965555f, 0.78057432f, 1.1293296f, -0.0075954632f, -0.30413029f, 0.98806405f, 0.17217696f,
  -0.045220543f, -0.38211912f, -1.0798947f, -0.97317886f, 1.4306349f, 0.41692045f, 1.234928f, -0.098205313f,
  0.0f, -0.016975235f, -0.016449537f, -0.9566254f, 1.4877127f, 1.4213978f, 0.72330362f, 0.88266402f,
  -0.26095688f, 1.4374388f, 0.25050029f, 0.601695f, 1.3703147f, -0.28127944f, 1.3950447f, 0.51172072f,
  0.69229823f, 1.061035f, -0.010318507f, 0.16696407f, 0.020379305f, -0.068191603f, -0.05046995f, -0.016098274f,
  -0.20680085f, -0.0024087375f, -0.05154267f, -0.091281354f, -0.022513881f, 0.16647471f, 0.020094311f, -0.098856926f,
  -0.062487517f, -0.60359454f, 1.0326749f, 0.97062308f, 1.2568408f, 0.13227861f, 2.33863f, 0.031060431f,
  2.3846581f, 0.094111674f, -0.064262

In [9]:
from pathlib import Path
import numpy as np
import joblib
import lightgbm as lgb
import treelite
from treelite.frontend import load_lightgbm_model
import tl2cgen
import os

# --- Paths (everything under ./ML Models) ---
BASE      = Path("./ML Models")               # folder with a space - safe via Path
BASE.mkdir(parents=True, exist_ok=True)
BUILD     = BASE / "build"
BUILD.mkdir(parents=True, exist_ok=True)

JOBLIB    = BASE / "best_lgbm_model_with_preproc__100.joblib"
TXT_MODEL = BASE / "lgbm.txt"
PKG_ZIP   = BUILD / "lgbm_src.zip"

In [10]:
# --- Load (preprocessor, booster_str) from your joblib ---
preprocessor, booster_str = joblib.load(JOBLIB)
booster = lgb.Booster(model_str=booster_str)

In [11]:
booster.save_model(str(TXT_MODEL), num_iteration=-1)
print(f"Saved LightGBM model to {TXT_MODEL}")

# --- Treelite: load and export C/C++ sources ---
model = treelite.frontend.load_lightgbm_model(TXT_MODEL)
os.makedirs("./ML Models/build", exist_ok=True)

Saved LightGBM model to ML Models/lgbm.txt


In [12]:
model = load_lightgbm_model(str(TXT_MODEL))


tl2cgen.export_srcpkg(
    model,
    toolchain="gcc",
    pkgpath=str(PKG_ZIP),           # ZIP will be: ./ML Models/build/lgbm_src.zip
    libname="lgbm",
    params={
        "quantize": 0,              # you asked for NO quantization
        "parallel_comp": 3          # split across files for easier Arduino builds
    },
)
print(f"Wrote {PKG_ZIP}")

[20:06:24] /project/src/compiler/ast/split.cc:35: Parallel compilation enabled; member trees will be divided into 3 translation units.
Wrote ML Models/build/lgbm_src.zip


[20:06:24] /tmp/tmpfk5q71dd/libbuild/_deps/treelite-src/src/serializer.cc:202: The model you are loading originated from a newer Treelite version; some functionalities may be unavailable.
Currently running Treelite version 4.1.2
The model checkpoint was generated from Treelite version 4.4.1


In [18]:
from treelite.frontend import load_xgboost_model
import xgboost as xgb
import treelite

In [19]:
# --- Paths (everything inside ./ML Models) ---
BASE      = Path("./ML Models")
BASE.mkdir(parents=True, exist_ok=True)
BUILD     = BASE / "build"
BUILD.mkdir(parents=True, exist_ok=True)

JOBLIB    = BASE / "best_xgb_model_100.joblib"
XGB_JSON  = BASE / "xgb.json"
PKG_ZIP   = BUILD / "xgb_src.zip"


In [20]:
# --- Load your pipeline and extract parts ---
pipeline = joblib.load(JOBLIB)
classifier = pipeline.named_steps["classifier"]   # XGBClassifier
preprocessor = pipeline.named_steps["preprocessor"]

In [21]:
# --- Save Booster to JSON (Treelite prefers JSON) ---
booster: xgb.Booster = classifier.get_booster()
# If you trained with early-stopping, best_ntree_limit is embedded in the JSON metadata
booster.save_model(str(XGB_JSON))
print(f"Saved XGBoost model to {XGB_JSON}")

Saved XGBoost model to ML Models/xgb.json


In [22]:
# --- Treelite: load and export C/C++ sources ---
model = load_xgboost_model(str(XGB_JSON))

In [30]:
tl2cgen.export_srcpkg(
    model,
    toolchain="gcc",
    pkgpath=str(PKG_ZIP),     # => ./ML Models/build/xgb_src.zip
    libname="xgb",
    params={
        "quantize": 0,        # keep float thresholds (you preferred no quantization)
        "parallel_comp": 3    # split into multiple .c files → easier Arduino builds
    }
)
print(f"Wrote {PKG_ZIP}")

[23:43:44] /project/src/compiler/ast/split.cc:35: Parallel compilation enabled; member trees will be divided into 3 translation units.
Wrote ML Models/build/xgb_src.zip


[23:43:44] /tmp/tmpfk5q71dd/libbuild/_deps/treelite-src/src/serializer.cc:202: The model you are loading originated from a newer Treelite version; some functionalities may be unavailable.
Currently running Treelite version 4.1.2
The model checkpoint was generated from Treelite version 4.4.1


In [29]:
i = 0  # choose a fixed row index you will paste to the MCU
x0 = preprocessor.transform(X_test[i:i+1]).astype(np.float32)
proba = classifier.predict_proba(x0)[0]        # shape (K,)
pred  = int(proba.argmax())
print("PY idx:", i, "pred:", pred, "label:", classifier.classes_[pred])
print("top-3:", sorted(enumerate(proba), key=lambda t:t[1], reverse=True)[:3])

# Emit the exact C array to paste into Arduino:
def to_c(a, var="Xf", dim=None, per_line=8):
    if dim is None: dim = a.size
    def fmt(v): return "0.0f" if v == 0 else ("1.0f" if v == 1 else f"{float(v):.8g}f")
    vals=[fmt(v) for v in a.ravel()]
    lines=[", ".join(vals[i:i+per_line]) for i in range(0,len(vals),per_line)]
    return f"static const float {var}[{dim}] = {{\n  " + ",\n  ".join(lines) + "\n};"
print("INPUT_DIM =", x0.size)
print(to_c(x0))

PY idx: 0 pred: 3 label: 3
top-3: [(3, 0.99997675), (10, 3.677209e-06), (8, 3.6407546e-06)]
INPUT_DIM = 94
static const float Xf[94] = {
  -1.4116956f, -0.18476674f, -0.039243355f, -0.57866377f, -0.51474041f, -0.54112536f, -0.32181111f, 0.12058315f,
  0.12065638f, 0.12062006f, 0.36582685f, -0.57956153f, -0.0058834325f, -0.30279472f, -0.44441816f, 0.172683f,
  -0.044832811f, -0.38126367f, 0.35925177f, 0.59046739f, -0.55538052f, -0.49170452f, -0.55805647f, -0.098239638f,
  0.0f, -0.016975235f, -0.016448833f, 1.0054787f, -0.65628535f, -0.59171414f, -0.18199365f, -0.62993717f,
  -0.26037058f, -0.74278361f, -0.21096753f, -0.60074139f, -0.71686429f, -0.27978122f, -0.69918746f, -0.31448436f,
  -0.55644727f, -0.54357797f, -0.01058036f, -0.26964867f, -0.039033215f, -0.10054152f, -0.16925305f, -0.016220458f,
  -0.30149484f, -0.040323764f, -0.07476715f, -0.11610999f, -0.025493743f, -0.26883566f, -0.039261725f, -0.13477188f,
  -0.16623086f, 0.13782649f, -0.57980216f, -0.46938577f, -0.61824411f, -0

In [25]:
labels = [str(x) for x in classifier.classes_.tolist()]
with open("./ML Models/xgb_labels.h", "w") as f:
    f.write("#pragma once\n")
    f.write(f"static const int XGB_NUM_CLASSES = {len(labels)};\n")
    f.write("static const char* XGB_LABELS[] = {\n  ")
    f.write(",\n  ".join(f"\"{lbl}\"" for lbl in labels))
    f.write("\n};\n")
print("Wrote xgb_labels.h")

Wrote xgb_labels.h


In [13]:
# --- Paths (everything inside ./ML Models) ---
BASE      = Path("./ML Models")
BASE.mkdir(parents=True, exist_ok=True)
BUILD     = BASE / "build"
BUILD.mkdir(parents=True, exist_ok=True)

JOBLIB    = BASE / "best_rf_model_100.joblib"
RF_JSON  = BASE / "rf.json"
PKG_ZIP   = BUILD / "rf_src.zip"


In [14]:
# --- Load your pipeline and extract parts ---
pipeline = joblib.load(JOBLIB)
classifier = pipeline.named_steps["classifier"]   # XGBClassifier
preprocessor = pipeline.named_steps["preprocessor"]

In [15]:
model = treelite.sklearn.import_model(classifier)

In [16]:
# Export to C
tl2cgen.export_srcpkg(
    model,
    toolchain="gcc",
    pkgpath=str(BUILD / "rf_src.zip"),
    libname="rf",
    params={"quantize": 0, "parallel_comp": 3}
)
print("Wrote RF C package to", BUILD / "rf_src.zip")

[20:12:09] /tmp/tmpfk5q71dd/libbuild/_deps/treelite-src/src/serializer.cc:202: The model you are loading originated from a newer Treelite version; some functionalities may be unavailable.
Currently running Treelite version 4.1.2
The model checkpoint was generated from Treelite version 4.4.1


[20:12:09] /project/src/compiler/ast/split.cc:35: Parallel compilation enabled; member trees will be divided into 3 translation units.
Wrote RF C package to ML Models/build/rf_src.zip


In [17]:
# (Optional) write labels for on-device printing
labels = [str(x) for x in classifier.classes_.tolist()]
with open(BUILD / "rf_labels.h", "w") as f:
    f.write("#pragma once\n")
    f.write(f"static const int RF_NUM_CLASSES = {len(labels)};\n")
    f.write("static const char* RF_LABELS[] = {\n  ")
    f.write(",\n  ".join(f"\"{lbl}\"" for lbl in labels))
    f.write("\n};\n")
print("Wrote", BUILD / "rf_labels.h")

Wrote ML Models/build/rf_labels.h
