# Settings

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# Project root directory. It is added to sys.path so that project modules
# (presumably the `src` package imported further down lives here) can be imported.
ROOT = "/content/drive/25-2-Machine-Learning-Onions/"
import sys
# Keep the cell idempotent: re-running it must not stack duplicate entries.
if ROOT not in sys.path:
    sys.path.append(ROOT)

In [None]:
import pickle
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

from src.train_tuning import Model

# Flag forwarded to the project's Model wrapper.
# NOTE(review): presumably True selects the binary task — confirm in src.train_tuning.
is_binary = True
# Single wrapper instance; later cells use it to build each base learner and to save the result.
model_wrapper = Model(is_binary=is_binary)

In [None]:
# Fixed list of the 33 feature names used by every base learner.
all_features = [
    # names without the `_first30` suffix
    'mean_iat', 'std_iat', 'median_iat', 'q25_iat', 'q75_iat',
    'ratio_out', 'switch_count', 'duration', 'n_packets',
    'burst_count', 'burst_mean', 'burst_std', 'burst_max', 'burst_ratio_in',
    'num_in', 'num_out', 'frac_in',
    # names with the `_first30` suffix
    'in_first30', 'out_first30',
    'mean_iat_first30', 'std_iat_first30', 'median_iat_first30',
    'q25_iat_first30', 'q75_iat_first30',
    'ratio_out_first30', 'switch_count_first30', 'duration_first30',
    'burst_count_first30', 'burst_mean_first30', 'burst_std_first30',
    'burst_max_first30', 'burst_ratio_in_first30',
    'frac_in_first30',
]  # fixed

# Hyper-parameters per base learner, keyed by the model key that is later
# handed to model_wrapper.create_model.
_tuned_params = {
    "logit": {'C': 1, 'penalty': 'l2'},
    # NOTE(review): scikit-learn's RandomForestClassifier has no `subsample`
    # argument — confirm that the Model wrapper maps or drops it.
    "rf": {'max_depth': None, 'subsample': None, 'n_estimators': 400},
    "lgb": {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 10,
            'n_estimators': 200, 'subsample': 1.0},
    "cat": {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200,
            'subsample': 1.0},
    "xgb": {'learning_rate': 0.5, 'max_depth': 10, 'num_parallel_tree': 10},
    "svm": {"C": 10, "gamma": 0.1, "kernel": "rbf"},
}

# Every learner currently shares the same feature list object.
model_params = {
    model_key: {"params": hyper, "features": all_features}
    for model_key, hyper in _tuned_params.items()
}

# Load Data

In [None]:
def _load_split(path):
    """Read one preprocessed split from a pickle file.

    The pickled object is indexed with the keys ``"X"``, ``"feature_names"``
    and ``"y_binary"``.

    Args:
        path: location of the pickle file.

    Returns:
        A ``(payload, X, y)`` tuple: the unpickled object, a DataFrame built
        from ``payload["X"]`` with ``payload["feature_names"]`` as columns, and
        the ``y_binary`` labels with every -1 replaced by 0.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserialising;
    # only point this at files produced by the project's own preprocessing.
    with open(path, "rb") as f:
        payload = pickle.load(f)
    X = pd.DataFrame(payload["X"])
    X.columns = payload["feature_names"]
    # Remap the -1 label to 0; every other label value is kept unchanged.
    y = np.where(payload["y_binary"] == -1, 0, payload["y_binary"])
    return payload, X, y


# Load the pickle files.
# NOTE: both paths are relative to the current working directory, not to ROOT.
print("Loading datafile...")
train, X_train, y_train = _load_split("./data/preprocessed/open_train_33.pkl")
val, X_val, y_val = _load_split("./data/preprocessed/open_val_33.pkl")

print("Loading complete.")
print("Training data size:\t", len(X_train))
print("Validation data size:\t", len(X_val))

In [None]:
# feature selection
def feature_selection(X, selected_features):
  """Return the part of ``X`` picked out by ``selected_features``.

  Args:
    X: object that is indexed as ``X[selected_features]``, e.g. a pandas
      DataFrame.
    selected_features: selector handed to ``X[...]``, e.g. a list of column
      names.

  Returns:
    Whatever ``X[selected_features]`` evaluates to.
  """
  subset = X[selected_features]
  return subset

# Training

In [None]:
# Code string that is assigned to model_wrapper.code right before the stacked model is saved.
# NOTE(review): this name shadows the built-in `type`, and any cell that rebinds
# `type` before the save step changes the code that gets stored.
type = "stacked"

In [None]:
# base model
estimators = []
print("[Load base models]")
for type, value in model_params.items():
  model_wrapper.create_model(type, **value.get('params'))
  model = model_wrapper._model
  model_nm = model_wrapper.code
  features = value.get('features')
  print(f"- {model_nm}:\n   - params: {model.get_params()}\n   - feature: {len(features)} features")
  transformer = ColumnTransformer([
      ('selector', 'passthrough', features)
  ])
  pipeline = Pipeline([
    ('preprocessor', transformer),
    ('classifier', model_wrapper._model)
  ])
  estimators.append((model_nm, pipeline))

# meta model
print("\n[Load meta model]")
meta_model = LogisticRegression()
meta_model.fit(X_train, y_train)

In [None]:
# Assemble the stacking ensemble from the base pipelines and the meta model,
# then train it on the training split.
stacked_model = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_model,
)
stacked_model.fit(X_train, y_train)

# `type` is the notebook variable (it shadows the built-in), so whatever string
# it holds at this point becomes the wrapper's code for the saved model.
model_wrapper.code = type
# NOTE(review): file_name is an empty string — presumably the wrapper falls
# back to a default name; confirm in src.train_tuning.
model_wrapper.save_model(model=stacked_model, file_name="")