## Required Files

To run this program, you need an "all_data.csv" file. The "all_data.csv" file should be located in the same directory as the program.

## Program Purpose

The purpose of this program is to apply various machine learning algorithms to a dataset and observe their performance. The algorithms used in this program are:

- Naive Bayes
- QDA (Quadratic Discriminant Analysis)
- Random Forest
- XGBoost (Extreme Gradient Boosting)
- AdaBoost
- MLP (Multi-Layer Perceptron)
- Nearest Neighbors

The program's output includes the following information for each algorithm:

- File name
- Machine learning algorithm name
- Accuracy
- Precision
- Recall
- F1-score
- Time taken

Additionally, the program will generate a CSV file containing the results and a folder containing graphics.

## Attribution

Some portions of the code used for calculations and graphing have been adapted from the [scikit-learn website](http://scikit-learn.org).


In [7]:

from sklearn import metrics
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import average_precision_score, confusion_matrix, roc_curve, roc_auc_score, auc
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

          
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline
import os
import pandas as pd
import csv
import time
import warnings
import math
import pickle
warnings.filterwarnings("ignore")


## Test: saving the best model

In [6]:

# --- Run configuration -------------------------------------------------------
# CSV file that receives one row of metrics per (algorithm, repetition).
result = "./results/results_3.csv"

# Dataset files to process; each name is resolved relative to `path`.
csv_files = ["all_data.csv"]

# Directory prefix prepended to every dataset file name ("" = current directory).
path = ""

# Number of train/test repetitions performed for every algorithm.
repetition = 10


def folder(f_name):
    """Create the directory ``f_name`` (and any missing parents) if needed.

    Calling it for a directory that already exists is a no-op. Any ``OSError``
    raised while creating the directory is reported with a printed message
    instead of being propagated, so the caller keeps running.

    Parameters
    ----------
    f_name : str
        Path of the directory to create.
    """
    try:
        # exist_ok=True replaces the separate "exists?" check, so there is no
        # window between checking and creating the directory.
        os.makedirs(f_name, exist_ok=True)
    except OSError:
        print ("The folder could not be created!")

# Make sure the output directories exist before anything is written to them.
# The loop variable is deliberately called `folder_name` so that, once the
# loop finishes, it still refers to the graph directory.
for folder_name in ("./results/", "./results/result_graph_3/"):
    folder(folder_name)


# Classifiers to benchmark, keyed by the display name used in the printed
# table, in the plot titles and in the results CSV.
ml_list = {
    "Naive Bayes": GaussianNB(),
    "QDA": QDA(),
    "XGB": XGBClassifier(
        n_estimators=100,
        learning_rate=0.01,
        objective='binary:logistic',
    ),
    "Random Forest": RandomForestClassifier(
        max_depth=5,
        n_estimators=100,
        max_features=2,
        n_jobs=-1,
    ),
    "AdaBoost": AdaBoostClassifier(),
    "MLP": MLPClassifier(hidden_layer_sizes=(13, 13, 13), max_iter=500),
    "Nearest Neighbors": KNeighborsClassifier(n_neighbors=3, n_jobs=-1),
}



# Columns read from each dataset, keyed by the file name without its extension.
# These are the 7 features with the highest importance weight selected by
# "04_2_feature_selection_for_attack_files.py", followed by the "Label" column.
features = {
    "all_data": [
        "Bwd Packet Length Std",
        "Flow Bytes/s",
        "Total Length of Fwd Packets",
        "Fwd Packet Length Std",
        "Flow IAT Std",
        "Flow IAT Min",
        "Fwd IAT Total",
        "Label",
    ],
}

# Start of the whole run; used at the very end to report the total duration.
seconds = time.time()

# (Re)create the results file and write its header row. The per-algorithm rows
# are appended later, so opening with "w" here discards any previous run.
with open(result, "w", newline="", encoding="utf-8") as results_file:
    header_writer = csv.writer(results_file)
    header_writer.writerow(
        ["File", "ML algorithm", "accuracy", "Precision", "Recall", "F1-score", "Time"]
    )

# Record of the best algorithm seen so far:
#   algorithm -> its name, roc_auc -> its score (starts at 0.0 so any positive
#   score replaces it), model -> the fitted estimator object.
best_model = dict(algorithm=None, roc_auc=0.0, model=None)

# Benchmark every classifier in `ml_list` on every dataset in `csv_files`.
# For each (dataset, classifier) pair the data is split 80/20 `repetition`
# times, the classifier is refit on each split, and accuracy / macro precision /
# macro recall / macro F1 / elapsed time / ROC AUC are collected. The mean of
# each metric is printed, the per-repetition values are appended to `result`,
# and the classifier with the highest mean ROC AUC is remembered in `best_model`.
for j in csv_files:
    print ('%-15s %-15s  %-12s %-12s %-12s %-12s %-12s %-12s' % ("File","ML algorithm","accuracy","Precision", "Recall" , "F1-score","Time", "roc_auc"))# print output header
    dataset_name = os.path.splitext(j)[0]  # file name without extension; key into `features`
    feature_list = list(features[dataset_name])
    df = pd.read_csv(path + j, usecols=feature_list)
    df = df.fillna(0)

    # Binary target: "BENIGN" -> 1, every other label (attack) -> 0.
    y = (df["Label"] == "BENIGN").astype(int)
    del df["Label"]
    feature_list.remove('Label')
    X = df[feature_list]

    for ii in ml_list:  # ii is the display name of the classifier
        precision = []
        recall = []
        f1 = []
        accuracy = []
        t_time = []
        roc_auc = []
        for i in range(repetition):
            second = time.time()  # start of this repetition

            # Repeated hold-out: 80% train / 20% test. The seed is the
            # repetition index so every repetition gets a DIFFERENT (but
            # reproducible) split; a constant seed would make all repetitions
            # evaluate the exact same split.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.20, random_state=i)

            clf = ml_list[ii]
            clf.fit(X_train, y_train)
            predict = clf.predict(X_test)

            # Macro-averaged metrics: both classes weigh equally.
            f_1 = f1_score(y_test, predict, average='macro')
            pr = precision_score(y_test, predict, average='macro')
            rc = recall_score(y_test, predict, average='macro')

            precision.append(float(pr))
            recall.append(float(rc))
            f1.append(float(f_1))
            # Reuse the predictions already computed instead of predicting a
            # second time through clf.score(); the resulting value is the same.
            accuracy.append(metrics.accuracy_score(y_test, predict))
            t_time.append(float(time.time() - second))

            # Probability of class 1 (BENIGN) is the score used for the ROC curve.
            y_scores = clf.predict_proba(X_test)[:, 1]
            fpr, tpr, _ = roc_curve(y_test, y_scores)
            roc_auc.append(auc(fpr, tpr))

        # ROC curve of the LAST repetition; the legend shows the mean AUC over
        # all repetitions.
        plt.figure(figsize=(5,3))
        plt.plot(fpr, tpr, color='darkorange', lw=1, label='ROC curve (area = %0.2f)' % np.mean(roc_auc))
        plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic - ' + str(ii))
        plt.legend(loc="lower right")
        plt.show()

        # Confusion matrix of the last repetition. labels=[1, 0] fixes the
        # row/column order to (BENIGN, Attack) so it matches the tick labels;
        # the default order would be (0=Attack, 1=BENIGN).
        cm = confusion_matrix(y_test, predict, labels=[1, 0])

        plt.figure(figsize=(3, 3))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["BENIGN", "Attack"], yticklabels=["BENIGN", "Attack"])
        plt.xlabel("Predicted Labels")
        plt.ylabel("True Labels")
        plt.title(f"Confusion Matrix - {ii}")
        plt.show()

        # Mean of every metric over all repetitions.
        mean_roc_auc = np.mean(roc_auc)
        print ('%-15s %-15s  %-12s %-12s %-12s %-12s %-12s %-12s' % (dataset_name,ii,str(round(np.mean(accuracy),2)),str(round(np.mean(precision),2)),
            str(round(np.mean(recall),2)),str(round(np.mean(f1),2)),str(round(np.mean(t_time),4)),str(round(mean_roc_auc,4))))

        # Keep the classifier with the highest mean ROC AUC. The stored object
        # is the estimator as fitted on the last repetition's training split.
        if mean_roc_auc > best_model['roc_auc']:
            best_model['algorithm'] = ii
            best_model['roc_auc'] = mean_roc_auc
            best_model['model'] = clf

        # Append one row per repetition: file, algorithm, accuracy, precision,
        # recall, F1-score, time.
        with open(result, "a", newline="",encoding="utf-8") as f:
            wrt = csv.writer(f)
            wrt.writerows(
                [dataset_name, ii, acc_v, pr_v, rc_v, f1_v, time_v]
                for acc_v, pr_v, rc_v, f1_v, time_v
                in zip(accuracy, precision, recall, f1, t_time))
   
        
# Persist the winning estimator (if any algorithm was selected) with pickle.
if best_model['model'] is None:
    print('No best model found.')
else:
    best_model_filename = 'best_model.pkl'
    with open(best_model_filename, 'wb') as model_file:
        pickle.dump(best_model['model'], model_file)
    print(f'Best model ({best_model["algorithm"]}) saved as {best_model_filename}')

# Wall-clock duration of the whole run, measured from the `seconds` timestamp.
print("Total operation time: = ", time.time() - seconds, "seconds")


File            ML algorithm     accuracy     Precision    Recall       F1-score     Time         roc_auc     


ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [5]:
# Load the complete dataset (every column) for interactive inspection.
df = pd.read_csv("all_data.csv")

In [6]:
# Display the column names of the loaded frame.
df.columns

Index(['Flow ID', 'Source IP', 'Source Port', 'Destination IP',
       'Destination Port', 'Protocol', 'Timestamp', 'Flow Duration',
       'Total Fwd Packets', 'Total Backward Packets',
       'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
  

In [9]:
# Show the seven selected feature columns together with the label column.
df[[
    "Bwd Packet Length Std",
    "Flow Bytes/s",
    "Total Length of Fwd Packets",
    "Fwd Packet Length Std",
    "Flow IAT Std",
    "Flow IAT Min",
    "Fwd IAT Total",
    'Label',
]]


Unnamed: 0,Bwd Packet Length Std,Flow Bytes/s,Total Length of Fwd Packets,Fwd Packet Length Std,Flow IAT Std,Flow IAT Min,Fwd IAT Total,Label
0,0.0,3000000,12.0,0.00000,0.000000,4.0,4.0,BENIGN
1,0.0,12000000,12.0,0.00000,0.000000,1.0,1.0,BENIGN
2,0.0,12000000,12.0,0.00000,0.000000,1.0,1.0,BENIGN
3,0.0,12000000,12.0,0.00000,0.000000,1.0,1.0,BENIGN
4,0.0,4000000,12.0,0.00000,0.000000,3.0,3.0,BENIGN
...,...,...,...,...,...,...,...,...
2672992,0.0,0,0.0,0.00000,7.778175,37.0,0.0,BENIGN
2672993,0.0,106194,12.0,0.00000,0.000000,113.0,113.0,BENIGN
2672994,0.0,0,0.0,0.00000,0.000000,115.0,0.0,BENIGN
2672995,0.0,773,148.0,85.44784,54961.659690,47.0,95825.0,BENIGN


In [13]:
import pickle

# Location of the pickled model.
# NOTE(review): this is an absolute, machine-specific path; it must be adjusted
# before the cell can run anywhere else.
model_path = '/home/darkstar/git_repos/intrusion/front_end/model/model.pkl'  # Replace with the actual path to your model file

# SECURITY: unpickling executes code stored in the file, so only load model
# files that come from a trusted source.
with open(model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Feature columns passed to the model, in this order.
dat = df[[
    "Bwd Packet Length Std",
    "Flow Bytes/s",
    "Total Length of Fwd Packets",
    "Fwd Packet Length Std",
    "Flow IAT Std",
    "Flow IAT Min",
    "Fwd IAT Total",
]]

# Predict a class for every row of the frame and show the resulting array.
predictions = loaded_model.predict(dat)
print(predictions)

[1 1 1 ... 1 1 1]


In [17]:
# Find unique values and their counts
# Count how many times each distinct predicted value occurs.
unique_values, counts = np.unique(predictions, return_counts=True)

# Report one line per distinct value, in the sorted order np.unique returns.
for position, value in enumerate(unique_values):
    count = counts[position]
    print(f"Value: {value}, Count: {count}")


Value: 0, Count: 467125
Value: 1, Count: 2205872
