# Supervised ML for anomaly detection in IoT to enhance network security
## Part 3 - DATA TRAINING

The IoT-23 dataset is a collection of network traffic from Internet of Things (IoT) devices. It includes 20 malware captures executed in IoT devices, and 3 hotspot captures for benign IoT devices traffic [1][2]. The 3 hotspot captures were not included in the data cleaning because they were not considered relevant for the specific analysis being performed.

In this notebook, we load the processed dataset file, train several supervised classification models on it using stratified 5-fold cross-validation, and save a trained model for each classifier.

> **INPUT:** the processed dataset csv file (`../data/processed/iot23_combined_processed.csv`). <br>
> **OUTPUT:** one trained model file per classifier stored in the models directory, plus a summary table of the cross-validation metrics.

***

In [2]:
# Import necessary libraries and modules
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import precision_score, confusion_matrix, recall_score, accuracy_score, f1_score
from statistics import mean
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from joblib import dump

In [3]:
# Remove the cap on displayed columns so wide frames are printed in full
pd.options.display.max_columns = None

In [6]:
# Load the processed dataset; the first CSV column becomes the row index
data_df = pd.read_csv(
    filepath_or_buffer='../data/processed/iot23_combined_processed.csv',
    index_col=0,
)

In [7]:
# Display the dataset dimensions as a (rows, columns) tuple
data_df.shape

(1444674, 17)

In [None]:
# Check dataset head: preview only the first rows instead of rendering the
# whole (very large) frame
data_df.head()

In [None]:
# Separate the target column ("label") from the remaining feature columns
data_y = data_df["label"]
data_X = data_df.drop(columns="label")

In [None]:
# Candidate models as a list of (display name, estimator) pairs, in the order
# they will be trained and reported
classifiers = list({
    # Complement Naive Bayes: the Naive Bayes variant suited to imbalanced
    # label distributions such as ours.
    "Naive Bayes": ComplementNB(),

    # Decision Tree with default parameters: Gini impurity as the split
    # criterion and ccp_alpha=0, i.e. no pruning.
    "Decision Tree": DecisionTreeClassifier(),

    # Logistic Regression, to reveal whether the classes are linearly separable.
    "Logistic Regression": LogisticRegression(),

    # Random Forest with the default of 100 base estimators.
    "Random Forest": RandomForestClassifier(),

    # Support Vector Machine in its classifier form.
    "Support Vector Classifier": SVC(),

    # Distance-based K-Nearest Neighbors with the default n_neighbors=5.
    "K-Nearest Neighbors": KNeighborsClassifier(),

    # XGBoost gradient-boosted ensemble with a binary logistic objective and
    # alpha=10 as initially tuned hyperparameters.
    "XGBoost": xgb.XGBClassifier(objective="binary:logistic", alpha=10),
}.items())

In [None]:
# Stratified cross-validator: 5 folds, with samples shuffled using a fixed
# seed before splitting
skf_cv = StratifiedKFold(
    random_state=0,
    shuffle=True,
    n_splits=5,
)

In [None]:
# Train and evaluate every classifier with the stratified cross-validator,
# collect the per-classifier metrics in `classification_results`, and persist
# the best-scoring (by F1) fitted model of each classifier to the models folder.
from copy import deepcopy
from pathlib import Path

print("Model Training Started!")

# Folder that receives the serialized models. It is built with pathlib so the
# separator matches the host OS, and created up front so dump() cannot fail
# on a missing directory.
models_path = Path("..") / "models"
models_path.mkdir(parents=True, exist_ok=True)

# Initialize the results summary (one row per classifier)
classification_results = pd.DataFrame(index=[c[0] for c in classifiers], columns=["Accuracy", "TN", "FP", "FN", "TP", "Recall", "Precision", "F1"])

# Iterate over the estimators
for est_name, est_object in classifiers:

    print(f"### [{est_name}]: Processing ...")

    # Per-fold metrics for the current classifier
    accuracy_scores = []
    confusion_matrices = []
    recall_scores = []
    precision_scores = []
    f1_scores = []

    # Best fitted model seen so far for this classifier and its F1 score
    best_model = None
    best_f1 = -1

    # Iterate over the obtained folds
    for train_index, test_index in skf_cv.split(data_X, data_y):

        # Get train and test samples from the cross-validation model
        X_train, X_test = data_X.iloc[train_index], data_X.iloc[test_index]
        y_train, y_test = data_y.iloc[train_index], data_y.iloc[test_index]

        # Train the model
        est_object.fit(X_train.values, y_train.values)

        # Predict the test samples
        y_pred = est_object.predict(X_test.values)

        # Calculate and register accuracy metrics
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        confusion_matrices.append(confusion_matrix(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred))
        est_f1_score = f1_score(y_test, y_pred)
        f1_scores.append(est_f1_score)

        # Compare with best performing model. The same estimator object is
        # re-fitted in place on every fold, so keeping a plain reference would
        # always end up pointing at the last fold's fit; snapshot the fitted
        # state instead.
        if best_f1 < est_f1_score:
            best_model = deepcopy(est_object)
            best_f1 = est_f1_score

    # Summarize the results for all folds for each classifier: confusion
    # matrices are summed element-wise, the other metrics are averaged
    tn, fp, fn, tp = sum(confusion_matrices).ravel()
    classification_results.loc[est_name] = [mean(accuracy_scores), tn, fp, fn, tp, mean(recall_scores), mean(precision_scores), mean(f1_scores)]

    # Save the best performing model. Compare against None explicitly:
    # truthiness of an estimator may go through __len__ (e.g. ensembles) and
    # is not a reliable "was a model selected" check.
    if best_model is not None:
        model_name = est_name.replace(' ', '_').replace('-', '_').lower()
        model_file = model_name + ".pkl"
        dump(best_model, models_path / model_file)

print("Model Training Finished!")

In [None]:
# Display the per-classifier summary table (mean fold metrics and the
# confusion-matrix counts summed over all folds)
classification_results