### Importing relevant libraries

In [1]:
import warnings
# Suppress every warning category for the rest of the kernel session, so
# library warnings will not appear in any cell output below.
warnings.filterwarnings("ignore")

In [2]:
%matplotlib inline
from numpy import array
from random import random
from sklearn import metrics
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB#57
from sklearn.naive_bayes import GaussianNB#52
from sklearn.naive_bayes import MultinomialNB#56
from sklearn.naive_bayes import CategoricalNB

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
import csv
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import sklearn
import time



### Discovering Labels

In [3]:
def target_name(name):
    """Return the sorted distinct values of the ``Label`` column of a CSV file.

    Parameters
    ----------
    name : str or path-like
        CSV file to read; only its ``Label`` column is loaded.

    Returns
    -------
    list
        The unique labels in ascending order.
    """
    label_column = pd.read_csv(name, usecols=["Label"])["Label"]
    unique_labels = list(label_column.unique())
    unique_labels.sort()
    return unique_labels

### Hyperparameters of machine learning algorithm.

In [4]:
# ml_list maps a short algorithm name to a ready-to-fit scikit-learn classifier.
# Every base estimator is wrapped in OneVsRestClassifier and carries its own
# fixed hyperparameters.
from sklearn.multiclass import OneVsRestClassifier

ml_list = {
    "NB": OneVsRestClassifier(CategoricalNB(alpha=1e-09)),
    "DT": OneVsRestClassifier(
        DecisionTreeClassifier(
            criterion='gini',
            max_depth=26,
            max_features=26,
            min_samples_split=6,
        )
    ),
    "RF": OneVsRestClassifier(
        RandomForestClassifier(
            bootstrap=True,
            criterion="gini",
            max_depth=18,
            max_features=8,
            min_samples_split=9,
            n_estimators=96,
        )
    ),
}

# Aggregation Algorithm normal

In [5]:
from collections import Counter  # used by most_frequent(); not imported in the top import cell

# Notebook-level placeholder for the aggregation time; ML() assigns its own
# local `altime` for every run.
altime=0


def most_frequent(List):
    """Return the most common element of ``List``, breaking ties at random.

    When several elements share the highest count, one of them is picked
    uniformly with ``np.random.randint``; tied candidates are considered in
    the order in which they first appear in ``List``.

    Parameters
    ----------
    List : iterable of hashable items
        The votes to aggregate.

    Returns
    -------
    The winning element.

    Raises
    ------
    ValueError
        If ``List`` is empty (there is nothing to vote on).
    """
    counts = Counter(List)
    if not counts:
        raise ValueError("most_frequent() requires a non-empty sequence")
    top_count = max(counts.values())
    # Counter keeps first-seen order, so this is the same candidate order a
    # stable sort by descending count would produce for the tied leaders.
    winners = [item for item, seen in counts.items() if seen == top_count]
    return winners[np.random.randint(len(winners))]

def split(a, n):
    """Lazily cut ``a`` into ``n`` consecutive slices of near-equal length.

    The first ``len(a) % n`` slices hold one extra element. The result is a
    generator; each item is ``a[start:stop]``, so it has the same sequence
    type as ``a``.
    """
    base, extra = divmod(len(a), n)

    def _bounds(part):
        # Parts before `extra` are one element longer than the rest.
        start = part * base + min(part, extra)
        stop = start + base + (1 if part < extra else 0)
        return start, stop

    return (a[start:stop] for start, stop in map(_bounds, range(n)))
def create_exception(df):
    """List MAC addresses that dominate more than one aggregated class.

    For every distinct value of ``df['aggregated']`` the most frequent ``MAC``
    among the matching rows is determined (on equal counts, the MAC met first
    wins). The first time a MAC is dominant it is only remembered; every later
    group dominated by the same MAC appends it to the result, so a MAC may
    appear several times.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``aggregated`` and ``MAC``.

    Returns
    -------
    list
        The repeated dominant MACs, in the order they were detected.
    """
    seen_dominant = []
    exception_list = []
    for group_value in df['aggregated'].unique():
        group_rows = df[df['aggregated'] == group_value]
        counts = {}
        for mac in group_rows['MAC'].values:
            counts[mac] = counts.get(mac, 0) + 1
        # Stable descending sort: ties stay in first-seen order.
        ranked = sorted(counts, key=counts.get, reverse=True)
        leader = next(iter(ranked))
        if leader in seen_dominant:
            exception_list.append(leader)
        else:
            seen_dominant.append(leader)
    return exception_list



def merged(m_test,predict,step,mixed):
    """Smooth per-sample predictions by majority vote inside each device.

    Samples are grouped by MAC address; within a device, consecutive windows
    of ``step`` samples are each relabelled with the window's most frequent
    prediction (``most_frequent``). With ``mixed`` set, devices reported by
    ``create_exception`` get their original predictions back.

    Parameters
    ----------
    m_test : pandas.Series
        MAC address of every test sample. The mixed branch reads it back as
        ``results.MAC``, so the series is expected to be named ``MAC``.
    predict : sequence
        Per-sample predictions, indexable by position.
    step : int
        Window size for the vote.
    mixed : bool
        Whether to apply the mixed method.

    Returns
    -------
    tuple
        ``(aggregated predictions as an array, elapsed seconds)``.
    """
    started=time.time()
    mac_per_row=[m_test[label] for label in m_test.index]

    # One bucket of row positions per device, created in sorted MAC order.
    positions_by_device={mac: [] for mac in sorted(list(m_test.unique()))}
    for position,mac in enumerate(mac_per_row):
        positions_by_device[mac].append(position)

    new_y=[0]*len(m_test)
    for positions in positions_by_device.values():
        for start in range(0, len(positions), step):
            window=positions[start:start + step]
            vote=most_frequent([predict[position] for position in window])
            for position in window:
                new_y[position]=vote

    results=pd.DataFrame(m_test)
    results["aggregated"]=new_y
    results["normal"]=predict

    # Mixed method: fall back to the raw predictions for the exception MACs.
    if mixed:
        for mac in create_exception(results):
            results.loc[results.MAC == mac, 'aggregated'] = results['normal']

    return results["aggregated"].values,time.time()-started














## Calculation of evaluations

In [6]:
def score(altime,train_time,test_time,predict,y_test,class_based_results,i,cv,dname,ii):
    """Evaluate one run, print a summary row and build the matching CSV line.

    Computes macro recall/precision/F1, accuracy, balanced accuracy and
    Cohen's kappa for ``predict`` against ``y_test``, and adds the per-class
    ``classification_report`` to the running ``class_based_results`` frame.
    The class names for the report come from the notebook-level variable
    ``target_names``.

    Parameters
    ----------
    altime : aggregation time, written unrounded to the CSV line.
    train_time, test_time : fit and prediction times.
    predict, y_test : predicted and true labels.
    class_based_results : pandas.DataFrame
        Accumulated per-class report; an empty frame starts the accumulation.
    i, cv : repetition index and run counter (only echoed in the output).
    dname : dataset name (only echoed in the output).
    ii : algorithm name; the printed row shows its first six characters.

    Returns
    -------
    tuple
        ``(csv_line, updated class_based_results)``.
    """
    def _avg(value, digits):
        # Go through np.mean so rounding and string form stay numpy-float based.
        return str(round(np.mean(value), digits))

    rc=sklearn.metrics.recall_score(y_test, predict,average= "macro")
    pr=sklearn.metrics.precision_score(y_test, predict,average= "macro")
    f_1=sklearn.metrics.f1_score(y_test, predict,average= "macro")

    report_frame = pd.DataFrame(
        classification_report(y_test, predict, target_names=target_names,output_dict=True)
    ).transpose()
    class_based_results = (
        report_frame
        if class_based_results.empty
        else class_based_results.add(report_frame, fill_value=0)
    )

    macro_precision=float(pr)
    macro_recall=float(rc)
    macro_f1=float(f_1)
    balanced=balanced_accuracy_score( y_test,predict)
    plain=accuracy_score(y_test, predict)
    kappa=round(float(sklearn.metrics.cohen_kappa_score(
        y_test, predict, labels=None, weights=None, sample_weight=None)),15)

    summary = (
        dname, i, cv, ii[0:6],
        _avg(plain, 2), _avg(balanced, 2), _avg(macro_precision, 2), _avg(macro_recall, 2),
        _avg(macro_f1, 4), _avg(kappa, 2), _avg(train_time, 2), _avg(test_time, 2),
        str(round(np.mean(test_time)+np.mean(train_time),2)), _avg(altime, 2),
    )
    print ('%-15s %-3s %-3s %-6s  %-5s %-5s %-5s %-5s %-8s %-5s %-8s %-8s%-8s%-8s' % summary)

    fields = [str(dname), str(i), str(cv), str(ii)]
    fields += [
        _avg(value, 15)
        for value in (plain, balanced, macro_precision, macro_recall,
                      macro_f1, kappa, train_time, test_time)
    ]
    fields.append(str(altime))
    lines = ",".join(fields) + "\n"
    return lines,class_based_results

In [7]:
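# loop1, loop2: file paths of the training data and the test data.
# output_csv: path of the CSV file the results are written to.
# cols: the columns to read.
# step: group size (used by the merged function).
# mixed: whether to use the mixed method (used by the merged function).
# dname: dataset name.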
import nltk  # 导入 nltk 以计算编辑距离
import Levenshtein
def _read_frame(path, cols):
    """Read the requested columns of one CSV file and apply the optional Protocol mapping."""
    df = pd.read_csv(path, usecols=cols)
    try:
        # Best effort: `Protocol` is an optional notebook-level mapping and may
        # not be defined at all. `except Exception` keeps that tolerance but,
        # unlike a bare `except`, lets KeyboardInterrupt through.
        df = df.replace({"Protocol": Protocol})
    except Exception:
        pass
    return df


def _split_frame(df):
    """Split a loaded frame into (feature array, label codes, MAC series, hash series).

    After the ``MAC`` column is removed, the last column is used as the label
    and the second-to-last as the nilsimsa hash; everything before them is the
    feature matrix.
    NOTE(review): this relies on the column order of the CSV file itself
    (``usecols`` does not reorder columns) — confirm the files end with the
    hash column followed by the label column.
    """
    mac = df["MAC"]
    remaining = df.drop(columns=["MAC"])
    X = np.array(remaining[remaining.columns[0:-2]])
    y = remaining[remaining.columns[-1]].astype('category').cat.codes
    hashes = remaining[remaining.columns[-2]]
    return X, y, mac, hashes


def _predict_with_hash_tiebreak(probabilities, test_hashes, hash_dict, threshold=0.001):
    """Pick a class per sample, resolving near-ties by nilsimsa edit distance.

    For each row of ``probabilities`` the classes whose probability is within
    ``threshold`` of the maximum are candidates. With a single candidate the
    top class is returned directly. With several, the candidate whose training
    hashes contain the closest match (smallest Levenshtein distance) to the
    sample's hash wins; on equal distance the more probable candidate is kept.
    """
    predict = []
    for sample_idx in range(probabilities.shape[0]):
        prob = probabilities[sample_idx]
        top_classes = np.argsort(prob)[::-1]  # most probable first
        max_class = top_classes[0]
        max_prob = prob[max_class]

        # Leading run of classes that are within `threshold` of the best one.
        candidates = []
        for class_idx in top_classes:
            if max_prob - prob[class_idx] < threshold:
                candidates.append(class_idx)
            else:
                break

        chosen_class = max_class
        # Distances only matter when there is a real tie; skipping them
        # otherwise avoids a full scan of the training hashes for every sample.
        if len(candidates) > 1:
            test_hash = test_hashes.iloc[sample_idx]
            min_distance = float('inf')
            for class_idx in candidates:
                class_hashes = hash_dict[class_idx]
                if class_hashes:
                    min_distance_current = min(
                        Levenshtein.distance(test_hash, h) for h in class_hashes
                    )
                    if min_distance_current < min_distance:
                        min_distance = min_distance_current
                        chosen_class = class_idx
        predict.append(chosen_class)
    return np.array(predict)


def ML(loop1,loop2,output_csv,cols,step,mixed,dname,fig_dir="./Aalto_test/",plot_mean_cm=False):
    """Train and evaluate every classifier in ``ml_list`` and log the results.

    For each algorithm the model is fitted and evaluated ``repetition`` times
    (1 for "MLP", otherwise 3) on a freshly shuffled copy of the test data.
    Predictions come from ``predict_proba`` with a nilsimsa-hash tie-break,
    are optionally aggregated with ``merged`` (when ``step != 1``), scored
    with ``score`` and appended to ``output_csv``. A confusion-matrix PNG is
    saved for every run and the averaged per-class report is written to
    ``class_based_results.csv``.

    Parameters
    ----------
    loop1, loop2 : paths of the training and the test CSV file.
    output_csv : path of the result CSV (overwritten).
    cols : columns to read from both files; must include ``MAC``.
    step : group size for ``merged``; ``1`` disables aggregation.
    mixed : whether ``merged`` uses the mixed method.
    dname : dataset name echoed in the output.
    fig_dir : directory for the per-run confusion-matrix images
        (created if missing). Defaults to the previously hard-coded path.
    plot_mean_cm : when true, also save and show the confusion matrix
        averaged over the repetitions as ``<output_csv><alg>_confusion matrix.pdf``.

    Reads the notebook-level names ``ml_list`` and ``target_names``.
    """
    # The training data does not change between runs: load it once.
    X_train, y_train, _, train_nilsimsa = _split_frame(_read_frame(loop1, cols))
    # NOTE(review): label codes are derived separately for the train and the
    # test file; they only line up if both files contain the same set of labels.

    # Training hashes per class code; a set is enough because only the minimum
    # distance to any hash of the class is used.
    hash_dict = {}
    for label, hash_value in zip(y_train, train_nilsimsa):
        hash_dict.setdefault(label, set()).add(hash_value)

    # The test file is read once and reshuffled for every repetition.
    test_df = _read_frame(loop2, cols)

    if fig_dir:
        os.makedirs(fig_dir, exist_ok=True)

    # `with` guarantees the result file is flushed and closed even if a run is
    # interrupted or fails.
    with open(output_csv, "w") as ths:
        ths.write("Dataset,T,CV,ML algorithm,Acc,b_Acc,Precision, Recall , F1-score, kappa ,tra-Time,test-Time,Al-Time\n")

        for ii in ml_list:
            print ('%-15s %-3s %-3s %-6s  %-5s %-5s %-5s %-5s %-8s %-5s %-8s %-8s%-8s%-8s'%
                   ("Dataset","T","CV","ML alg","Acc","b_Acc","Prec", "Rec" , "F1", "kap" ,"tra-T","test-T","total","al-time"))
            class_based_results=pd.DataFrame()
            cm=pd.DataFrame()
            cv=0
            # A single repetition for the slow MLP, three for everything else.
            repetition = 1 if ii == "MLP" else 3

            for i in range(repetition):
                X_test, y_test, m_test, test_nilsimsa = _split_frame(shuffle(test_df))
                cv+=1

                # Fit and time the selected algorithm.
                clf = ml_list[ii]
                second=time.time()
                clf.fit(X_train, y_train)
                train_time=(float((time.time()-second)) )
                second=time.time()
                probabilities = clf.predict_proba(X_test)
                test_time=(float((time.time()-second)) )

                predict = _predict_with_hash_tiebreak(
                    probabilities, test_nilsimsa, hash_dict, threshold=0.001)

                if step==1:
                    altime=0
                else:
                    predict,altime=merged(m_test,predict,step,mixed)
                lines,class_based_results=score(altime,train_time,test_time,predict,y_test,class_based_results,i,cv,dname,ii)
                ths.write (lines)

                df_cm = pd.DataFrame(confusion_matrix(y_test, predict))

                # Draw and save this run's confusion matrix.
                plt.figure(figsize=(10, 8))
                sns.heatmap(df_cm, annot=True, fmt="d", cmap="Blues", linewidths=.5, cbar=True, xticklabels=target_names, yticklabels=target_names)
                plt.title(f"Confusion Matrix - {ii} (CV={cv}, Rep={i})")
                plt.xlabel("Predicted Labels")
                plt.ylabel("True Labels")
                confusion_matrix_image = os.path.join(fig_dir, f"confusion_matrix_{ii}_cv{cv}_rep{i}.png")
                plt.savefig(confusion_matrix_image)
                plt.close()  # release the figure; many are created in this loop

                if cm.empty:
                    cm =df_cm
                else:
                    cm = cm.add(df_cm, fill_value=0)

            class_based_results=class_based_results/repetition
            # NOTE(review): the same file name is used for every algorithm, so
            # only the last algorithm's report remains on disk.
            class_based_results.to_csv("class_based_results.csv")
            if plot_mean_cm:
                cm=cm//repetition
                graph_name=output_csv+ii+"_confusion matrix.pdf"
                plt.figure(figsize = (40,28))
                sns.heatmap(cm,xticklabels=target_names, yticklabels=target_names, annot=True, fmt='g')
                plt.savefig(graph_name,bbox_inches='tight')
                plt.show()
                print("\n\n\n")

# Machine learning applications 

# Aalto Dataset

In [8]:
# Columns requested from the CSV files: the model inputs, followed by the
# nilsimsa hash, the device MAC address and the class label.
feature = [
    # packet / link layer
    'pck_size', 'Ether_type', 'LLC_ctrl',
    'EAPOL_version', 'EAPOL_type',
    # IP / ICMP
    'IP_ihl', 'IP_tos', 'IP_len', 'IP_flags', 'IP_DF', 'IP_ttl', 'IP_options',
    'ICMP_code',
    # TCP / UDP
    'TCP_dataofs', 'TCP_FIN', 'TCP_ACK', 'TCP_window',
    'UDP_len',
    # DHCP / BOOTP
    'DHCP_options',
    'BOOTP_hlen', 'BOOTP_flags', 'BOOTP_sname', 'BOOTP_file', 'BOOTP_options',
    # DNS
    'DNS_qr', 'DNS_rd', 'DNS_qdcount',
    # derived
    'dport_class', 'payload_bytes', 'entropy',
    # hash, device identifier, target
    'nilsimsa_hash',
    "MAC",
    'Label',
]


# NB & DT & RF

In [9]:
# Input files for this experiment.
train='Aalto_BIG_train_IoTDevID_nilsimsa.csv'
test='Aalto_test_IoTDevID_nilsimsa.csv'

# Output location and run settings. step=1 means ML() skips the merged()
# aggregation; `sayac` only numbers the output file.
dataset="./Aalto_test/"
step=1
mixed=False
sayac=1
output_csv=f"{dataset}{sayac}_{step}_{mixed}.csv"

# score() and ML() read `target_names` as a notebook-level variable, so it has
# to be set before ML() is called.
target_names=target_name(test)
ML(train,test,output_csv,feature,step,mixed,f"{dataset[2:-1]}_{step}")


Dataset         T   CV  ML alg  Acc   b_Acc Prec  Rec   F1       kap   tra-T    test-T  total   al-time 
Aalto_test_1    0   1   NB      0.62  0.63  0.59  0.63  0.5627   0.59  1.55     0.42    1.97    0.0     
Aalto_test_1    1   2   NB      0.62  0.63  0.59  0.63  0.5627   0.59  1.33     0.33    1.65    0.0     
Aalto_test_1    2   3   NB      0.62  0.63  0.59  0.63  0.5627   0.59  1.33     0.33    1.66    0.0     
Dataset         T   CV  ML alg  Acc   b_Acc Prec  Rec   F1       kap   tra-T    test-T  total   al-time 
Aalto_test_1    0   1   DT      0.73  0.74  0.78  0.74  0.751    0.71  2.32     0.07    2.39    0.0     
Aalto_test_1    1   2   DT      0.73  0.74  0.78  0.74  0.7509   0.71  2.31     0.06    2.37    0.0     
Aalto_test_1    2   3   DT      0.73  0.74  0.79  0.74  0.7525   0.71  2.34     0.06    2.4     0.0     
Dataset         T   CV  ML alg  Acc   b_Acc Prec  Rec   F1       kap   tra-T    test-T  total   al-time 


KeyboardInterrupt: 

# SVM & GB

In [None]:
# ml_list={"SVM":SVC(C=10,gamma=1),
#          "GB":GradientBoostingClassifier(learning_rate=0.001,subsample=0.1,n_estimators=500,max_depth= 10,)}

In [None]:
# test='Aalto_test_IoTDevID.csv'
# train='Aalto_BIG_train_IoTDevID.csv'
# 
# 
# 
# dataset="./Aalto/"
# step=1
# 
# 
# mixed=False
# sayac=2
# output_csv=dataset+str(sayac)+"_"+str(step)+"_"+str(mixed)+"100_svm_gb.csv"
# target_names=target_name(test)
# ML(train,test,output_csv,feature,step,mixed,dataset[2:-1]+"_"+str(step))   


# GB

In [None]:
# ml_list={"GB":GradientBoostingClassifier(learning_rate=0.001,subsample=0.1,n_estimators=500,max_depth= 10)}

In [None]:
# test='Aalto_test_IoTDevID.csv'
# train='Aalto_BIG_train_IoTDevID.csv'
# 
# 
# 
# dataset="./Aalto/"
# step=1
# 
# 
# mixed=False
# sayac=2
# output_csv=dataset+str(sayac)+"_"+str(step)+"_"+str(mixed)+"100_svm_gb_GB.csv"
# target_names=target_name(test)
# ML(train,test,output_csv,feature,step,mixed,dataset[2:-1]+"_"+str(step))   


# MLP

In [None]:
# from sklearn.neural_network import MLPClassifier
# from sklearn.datasets import make_classification
# from sklearn.model_selection import train_test_split
# 
# 
# 
# test='Aalto_test_IoTDevID.csv'
# train='Aalto_BIG_train_IoTDevID.csv'
# 
# 
# 
# dataset="./Aalto/"
# step=1
# 
# ml_list={"MLP":MLPClassifier(solver= 'adam', learning_rate= 'constant', hidden_layer_sizes= (1220, 1965), alpha= 0.1, activation= 'relu')}
# 
# mixed=False
# sayac=2
# output_csv=dataset+str(sayac)+"_"+str(step)+"_"+str(mixed)+"100_mlp.csv"
# target_names=target_name(test)
# ML(train,test,output_csv,feature,step,mixed,dataset[2:-1]+"_"+str(step))   
# 
# 
