In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import time
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import (
    accuracy_score, confusion_matrix, ConfusionMatrixDisplay,
    RocCurveDisplay, auc, precision_score, recall_score,
    f1_score, roc_curve, classification_report, roc_auc_score
)
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

In [2]:
train_df = pd.read_parquet('UNSW_NB15_training-set.parquet')
test_df = pd.read_parquet('UNSW_NB15_testing-set.parquet')

# Concatenate the training and test sets
df = pd.concat([train_df, test_df], ignore_index=True)

df.info()

df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257673 entries, 0 to 257672
Data columns (total 36 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   dur                257673 non-null  float32 
 1   proto              257673 non-null  object  
 2   service            257673 non-null  category
 3   state              257673 non-null  object  
 4   spkts              257673 non-null  int16   
 5   dpkts              257673 non-null  int16   
 6   sbytes             257673 non-null  int32   
 7   dbytes             257673 non-null  int32   
 8   rate               257673 non-null  float32 
 9   sload              257673 non-null  float32 
 10  dload              257673 non-null  float32 
 11  sloss              257673 non-null  int16   
 12  dloss              257673 non-null  int16   
 13  sinpkt             257673 non-null  float32 
 14  dinpkt             257673 non-null  float32 
 15  sjit               257673 non-null

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
0,1.1e-05,udp,-,INT,2,0,496,0,90909.09375,180363632.0,...,0,0,1,1,0,0,0,0,Normal,0
1,8e-06,udp,-,INT,2,0,1762,0,125000.0,881000000.0,...,0,0,1,1,0,0,0,0,Normal,0
2,5e-06,udp,-,INT,2,0,1068,0,200000.0,854400000.0,...,0,0,1,1,0,0,0,0,Normal,0
3,6e-06,udp,-,INT,2,0,900,0,166666.65625,600000000.0,...,0,0,2,1,0,0,0,0,Normal,0
4,1e-05,udp,-,INT,2,0,2126,0,100000.0,850400000.0,...,0,0,2,1,0,0,0,0,Normal,0


In [3]:
df.duplicated().sum()

112451

In [4]:
unique_services = df['service'].unique()
unique_attack_cat = df['attack_cat'].unique()
unique_proto = df["proto"].unique()
unique_state = df["state"].unique()

categories = [
    ("Types of 'service' observations:", unique_services),
    ("Types of attack:", unique_attack_cat),
    ("Types of protocol:", unique_proto),
    ("Types of states:", unique_state)
]
[print(f"{title}\n{', '.join(values)}\n") for title, values in categories]

Types of 'service' observations:
-, http, ftp, ftp-data, smtp, pop3, dns, snmp, ssl, dhcp, irc, radius, ssh

Types of attack:
Normal, Reconnaissance, Backdoor, DoS, Exploits, Analysis, Fuzzers, Worms, Shellcode, Generic

Types of protocol:
udp, arp, tcp, igmp, ospf, sctp, gre, ggp, ip, ipnip, st2, argus, chaos, egp, emcon, nvp, pup, xnet, mux, dcn, hmp, prm, trunk-1, trunk-2, xns-idp, leaf-1, leaf-2, irtp, rdp, netblt, mfe-nsp, merit-inp, 3pc, idpr, ddp, idpr-cmtp, tp++, ipv6, sdrp, ipv6-frag, ipv6-route, idrp, mhrp, i-nlsp, rvd, mobile, narp, skip, tlsp, ipv6-no, any, ipv6-opts, cftp, sat-expak, ippc, kryptolan, sat-mon, cpnx, wsn, pvp, br-sat-mon, sun-nd, wb-mon, vmtp, ttp, vines, nsfnet-igp, dgp, eigrp, tcf, sprite-rpc, larp, mtp, ax.25, ipip, aes-sp3-d, micp, encap, pri-enc, gmtp, ifmp, pnni, qnx, scps, cbt, bbn-rcc, igp, bna, swipe, visa, ipcv, cphb, iso-tp4, wb-expak, sep, secure-vmtp, xtp, il, rsvp, unas, fc, iso-ip, etherip, pim, aris, a/n, ipcomp, snp, compaq-peer, ipx-n-ip, p

[None, None, None, None]

In [5]:
df = df[df['service'] != '-']

In [6]:
df1=df.copy() # To be used later

#drop irrelevant columns, hence id and the target column attack_category
drop_column = ["attack_cat"]
df.drop(drop_column, axis=1, inplace=True)

In [7]:
df_categorical = df.select_dtypes(exclude=[np.number])
df_categorical

Unnamed: 0,proto,service,state
35,tcp,http,FIN
40,tcp,http,FIN
45,tcp,http,FIN
49,tcp,http,FIN
72,tcp,http,FIN
...,...,...,...
257667,udp,dns,INT
257668,udp,dns,INT
257670,udp,dns,INT
257671,udp,dns,INT


In [8]:
proto_unique = df['proto'].unique()
service_unique = df['service'].unique()
state_unique = df['state'].unique()

print("Unique 'proto' values:", proto_unique)
print("Unique 'service' values:", service_unique)
print("Unique 'state' values:", state_unique)

Unique 'proto' values: ['tcp' 'udp']
Unique 'service' values: ['http', 'ftp', 'ftp-data', 'smtp', 'pop3', ..., 'ssl', 'dhcp', 'irc', 'radius', 'ssh']
Length: 12
Categories (13, object): ['-', 'dhcp', 'dns', 'ftp', ..., 'smtp', 'snmp', 'ssh', 'ssl']
Unique 'state' values: ['FIN' 'INT' 'CON' 'ACC' 'REQ' 'RST']


In [9]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Sample DataFrame



# Initialize LabelEncoder
label1 = LabelEncoder()

# Fit the LabelEncoder with unique values from the 'proto' column
df['proto']=label1.fit_transform(df['proto'])

# Now you can transform a single value or a list of values
value_to_transform = ['udp']  # This should be a list
encoded_value = label1.transform(value_to_transform)

print(f"Encoded value for 'udp': {encoded_value[0]}")


Encoded value for 'udp': 1


In [10]:
label2 = LabelEncoder()

df['service']=label2.fit_transform(df['service'])

val1 = ['http']
ev = label2.transform(val1)
print(ev)

[4]


In [11]:
label3 =  LabelEncoder()
df['state']=label3.fit_transform(df['state'])
val2 = ['INT']
ev1 = label3.transform(val2)
print(ev1)

[3]


In [12]:
scaler = StandardScaler()
numerical_columns = df.select_dtypes(include=[np.number]).columns
numerical_columns

Index(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt',
       'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt',
       'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'is_sm_ips_ports',
       'label'],
      dtype='object')

In [13]:
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
df[numerical_columns]

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,dmean,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,label
35,0.045627,-1.203937,0.720069,-0.620894,-0.054516,-0.069046,-0.052843,-0.071704,-0.632651,-0.617410,...,0.116395,0.747477,-0.056250,-0.770080,-0.825661,-0.16555,-0.165479,0.724771,0.0,-1.731059
40,0.182081,-1.203937,0.720069,-0.620894,-0.054516,-0.054114,-0.052801,-0.071172,-0.632680,-0.617437,...,0.035370,0.747477,-0.056210,-0.770080,-0.825661,-0.16555,-0.165479,0.724771,0.0,-1.731059
45,0.064308,-1.203937,0.720069,-0.620894,-0.054516,-0.069046,-0.052784,-0.071918,-0.632658,-0.617414,...,0.096138,0.747477,-0.056509,-0.770080,-0.825661,-0.16555,-0.165479,0.724771,0.0,-1.731059
49,0.047279,-1.203937,0.720069,-0.620894,-0.054516,-0.054114,-0.052894,-0.070336,-0.632639,-0.617412,...,0.092087,0.747477,-0.055202,-0.770080,-0.825661,-0.16555,-0.165479,0.724771,0.0,-1.731059
72,0.124732,-1.203937,0.720069,-0.620894,-0.043517,-0.069046,-0.052498,-0.071997,-0.632667,-0.617422,...,0.088036,0.747477,-0.056618,-0.770080,-0.825661,-0.16555,-0.165479,0.724771,0.0,-1.731059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257667,-0.197858,0.830608,-0.669756,0.898620,-0.098512,-0.128774,-0.055798,-0.078325,0.356981,0.338043,...,-0.479140,-0.211689,-0.058756,2.244145,1.337896,-0.16555,-0.165479,-0.288988,0.0,0.577681
257668,-0.197857,0.830608,-0.669756,0.898620,-0.098512,-0.128774,-0.055798,-0.078325,0.027070,0.019533,...,-0.479140,-0.211689,-0.058756,1.396394,0.797006,-0.16555,-0.165479,-0.288988,0.0,0.577681
257670,-0.197857,0.830608,-0.669756,0.898620,-0.098512,-0.128774,-0.055798,-0.078325,0.027070,0.019533,...,-0.479140,-0.211689,-0.058756,-0.581691,-0.555217,-0.16555,-0.165479,-0.288988,0.0,0.577681
257671,-0.197857,0.830608,-0.669756,0.898620,-0.098512,-0.128774,-0.055798,-0.078325,0.027070,0.019533,...,-0.479140,-0.211689,-0.058756,1.961562,0.932229,-0.16555,-0.165479,-0.288988,0.0,0.577681


In [14]:
# Sample values for each feature
sample_data = {
    'dur': 15.0,                    # Example duration
    'proto': 1,                     # Corresponding to 'tcp' (for example)
    'service': 4,                   # Corresponding to 'http'
    'state': 2,                     # Corresponding to 'REQ'
    'spkts': 5,                     # Number of packets
    'dpkts': 3,                     # Number of dropped packets
    'sbytes': 1000,                 # Sent bytes
    'dbytes': 1500,                 # Received bytes
    'rate': 1.5,                    # Some rate value
    'sload': 0.5,                   # Sender load
    'dload': 0.4,                   # Receiver load
    'sloss': 0.1,                   # Sender loss
    'dloss': 0.05,                  # Receiver loss
    'sinpkt': 0.2,                  # Sender inter-packet time
    'dinpkt': 0.1,                  # Receiver inter-packet time
    'sjit': 0.01,                   # Sender jitter
    'djit': 0.02,                   # Receiver jitter
    'swin': 1000,                   # Sender window size
    'stcpb': 100,                   # Sender TCP buffer size
    'dtcpb': 200,                   # Receiver TCP buffer size
    'dwin': 2000,                   # Receiver window size
    'tcprtt': 0.2,                  # TCP round-trip time
    'synack': 0.15,                 # SYN-ACK time
    'ackdat': 0.1,                  # ACK data time
    'smean': 50,                    # Sender mean
    'dmean': 60,                    # Receiver mean
    'trans_depth': 1,               # Transaction depth
    'response_body_len': 500,       # Length of response body
    'ct_src_dport_ltm': 10,         # Count of source ports
    'ct_dst_sport_ltm': 5,          # Count of destination ports
    'is_ftp_login': 1,              # Is FTP login (1 for yes, 0 for no)
    'ct_ftp_cmd': 2,                # Count of FTP commands
    'ct_flw_http_mthd': 3,          # Count of HTTP methods
    'is_sm_ips_ports': 1,           # Is it a small number of IPs and ports (1 for yes, 0 for no)
    'label': 0  
}

# Now, you can convert this dictionary into a DataFrame for scaling or further processing
import pandas as pd

# Convert dictionary to DataFrame
test_data_df = pd.DataFrame([sample_data])
test_data_df


Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,dmean,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,label
0,15.0,1,4,2,5,3,1000,1500,1.5,0.5,...,60,1,500,10,5,1,2,3,1,0


In [15]:
val1 = scaler.transform(test_data_df)
val1

array([[ 3.51430059,  0.83060827,  0.72006864, -0.62089422, -0.0820136 ,
        -0.10637609, -0.05206861, -0.06985036, -0.63274424, -0.61748534,
        -0.21585693, -0.0690673 , -0.09699769, -0.16783883, -0.1463623 ,
        -0.12475491, -0.17112272,  7.1480005 , -0.66436093, -0.66470763,
        15.12667613,  2.43619555,  3.73447705,  2.35666642, -0.36267486,
        -0.23606433,  0.7474772 , -0.05194788,  0.07767051, -0.284772  ,
         5.66615408, 11.46475561,  2.752291  ,  1.        , -1.73105896]])

In [16]:
first_row = df1.iloc[0]
first_row
first_row_dict = first_row.to_dict()

# Print the first row's values with keys as column names
print(first_row_dict)


{'dur': 0.9838740229606628, 'proto': 'tcp', 'service': 'http', 'state': 'FIN', 'spkts': 10, 'dpkts': 8, 'sbytes': 816, 'dbytes': 1172, 'rate': 17.278635025024414, 'sload': 5976.375, 'dload': 8342.53125, 'sloss': 2, 'dloss': 2, 'sinpkt': 109.3193359375, 'dinpkt': 124.932861328125, 'sjit': 5929.2119140625, 'djit': 192.5904083251953, 'swin': 255, 'stcpb': 794167371, 'dtcpb': 1624757001, 'dwin': 255, 'tcprtt': 0.20657199621200562, 'synack': 0.10839299857616425, 'ackdat': 0.09817899763584137, 'smean': 82, 'dmean': 147, 'trans_depth': 1, 'response_body_len': 184, 'ct_src_dport_ltm': 1, 'ct_dst_sport_ltm': 1, 'is_ftp_login': 0, 'ct_ftp_cmd': 0, 'ct_flw_http_mthd': 1, 'is_sm_ips_ports': 0, 'attack_cat': 'Normal', 'label': 0}


In [17]:

first_row_dict['proto'] = label1.transform([first_row_dict['proto']])[0]
first_row_dict['proto'] 

0

In [18]:
first_row_dict['service'] = label2.transform([first_row_dict['service']])[0]
first_row_dict['service']

4

In [19]:
first_row_dict['state'] = label3.transform([first_row_dict['state']])[0]
first_row_dict['state']

2

In [20]:
dftest = pd.DataFrame([first_row_dict])


dftest = dftest.drop(['attack_cat'],axis=1)
dftest


Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,dmean,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,label
0,0.983874,0,4,2,10,8,816,1172,17.278635,5976.375,...,147,1,184,1,1,0,0,1,0,0


In [21]:
val2 = scaler.transform(dftest)
val2

array([[ 0.04562684, -1.20393697,  0.72006864, -0.62089422, -0.05451577,
        -0.06904615, -0.05284302, -0.07170354, -0.63265054, -0.61741021,
        -0.20723095, -0.04780724, -0.06702521,  0.27936421,  0.30179019,
         0.12957628, -0.12761145,  1.20393697, -0.06447789,  0.56906837,
         1.20395838,  2.5342373 ,  2.55761558,  2.30386434, -0.20930557,
         0.11639475,  0.7474772 , -0.05625037, -0.77008046, -0.82566121,
        -0.1655504 , -0.16547936,  0.72477136,  0.        , -1.73105896]])

In [22]:
df

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,dmean,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,label
35,0.045627,-1.203937,0.720069,-0.620894,-0.054516,-0.069046,-0.052843,-0.071704,-0.632651,-0.617410,...,0.116395,0.747477,-0.056250,-0.770080,-0.825661,-0.16555,-0.165479,0.724771,0.0,-1.731059
40,0.182081,-1.203937,0.720069,-0.620894,-0.054516,-0.054114,-0.052801,-0.071172,-0.632680,-0.617437,...,0.035370,0.747477,-0.056210,-0.770080,-0.825661,-0.16555,-0.165479,0.724771,0.0,-1.731059
45,0.064308,-1.203937,0.720069,-0.620894,-0.054516,-0.069046,-0.052784,-0.071918,-0.632658,-0.617414,...,0.096138,0.747477,-0.056509,-0.770080,-0.825661,-0.16555,-0.165479,0.724771,0.0,-1.731059
49,0.047279,-1.203937,0.720069,-0.620894,-0.054516,-0.054114,-0.052894,-0.070336,-0.632639,-0.617412,...,0.092087,0.747477,-0.055202,-0.770080,-0.825661,-0.16555,-0.165479,0.724771,0.0,-1.731059
72,0.124732,-1.203937,0.720069,-0.620894,-0.043517,-0.069046,-0.052498,-0.071997,-0.632667,-0.617422,...,0.088036,0.747477,-0.056618,-0.770080,-0.825661,-0.16555,-0.165479,0.724771,0.0,-1.731059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257667,-0.197858,0.830608,-0.669756,0.898620,-0.098512,-0.128774,-0.055798,-0.078325,0.356981,0.338043,...,-0.479140,-0.211689,-0.058756,2.244145,1.337896,-0.16555,-0.165479,-0.288988,0.0,0.577681
257668,-0.197857,0.830608,-0.669756,0.898620,-0.098512,-0.128774,-0.055798,-0.078325,0.027070,0.019533,...,-0.479140,-0.211689,-0.058756,1.396394,0.797006,-0.16555,-0.165479,-0.288988,0.0,0.577681
257670,-0.197857,0.830608,-0.669756,0.898620,-0.098512,-0.128774,-0.055798,-0.078325,0.027070,0.019533,...,-0.479140,-0.211689,-0.058756,-0.581691,-0.555217,-0.16555,-0.165479,-0.288988,0.0,0.577681
257671,-0.197857,0.830608,-0.669756,0.898620,-0.098512,-0.128774,-0.055798,-0.078325,0.027070,0.019533,...,-0.479140,-0.211689,-0.058756,1.961562,0.932229,-0.16555,-0.165479,-0.288988,0.0,0.577681


In [140]:

train_df = pd.read_parquet('UNSW_NB15_training-set.parquet')
test_df = pd.read_parquet('UNSW_NB15_testing-set.parquet')

# Concatenate the training and test sets
df2 = pd.concat([train_df, test_df], ignore_index=True)

df2.info()

df2.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257673 entries, 0 to 257672
Data columns (total 36 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   dur                257673 non-null  float32 
 1   proto              257673 non-null  object  
 2   service            257673 non-null  category
 3   state              257673 non-null  object  
 4   spkts              257673 non-null  int16   
 5   dpkts              257673 non-null  int16   
 6   sbytes             257673 non-null  int32   
 7   dbytes             257673 non-null  int32   
 8   rate               257673 non-null  float32 
 9   sload              257673 non-null  float32 
 10  dload              257673 non-null  float32 
 11  sloss              257673 non-null  int16   
 12  dloss              257673 non-null  int16   
 13  sinpkt             257673 non-null  float32 
 14  dinpkt             257673 non-null  float32 
 15  sjit               257673 non-null

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
0,1.1e-05,udp,-,INT,2,0,496,0,90909.09375,180363632.0,...,0,0,1,1,0,0,0,0,Normal,0
1,8e-06,udp,-,INT,2,0,1762,0,125000.0,881000000.0,...,0,0,1,1,0,0,0,0,Normal,0
2,5e-06,udp,-,INT,2,0,1068,0,200000.0,854400000.0,...,0,0,1,1,0,0,0,0,Normal,0
3,6e-06,udp,-,INT,2,0,900,0,166666.65625,600000000.0,...,0,0,2,1,0,0,0,0,Normal,0
4,1e-05,udp,-,INT,2,0,2126,0,100000.0,850400000.0,...,0,0,2,1,0,0,0,0,Normal,0


In [141]:
df2 = df2[df2['service']!='-']
df2

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
35,0.983874,tcp,http,FIN,10,8,816,1172,17.278635,5.976375e+03,...,1,184,1,1,0,0,1,0,Normal,0
40,1.535254,tcp,http,FIN,10,10,826,1266,12.375802,3.876883e+03,...,1,187,1,1,0,0,1,0,Normal,0
45,1.059359,tcp,http,FIN,10,8,830,1134,16.047441,5.641147e+03,...,1,165,1,1,0,0,1,0,Normal,0
49,0.990548,tcp,http,FIN,10,10,804,1414,19.181301,5.847268e+03,...,1,261,1,1,0,0,1,0,Normal,0
72,1.303518,tcp,http,FIN,12,8,898,1120,14.575939,5.057084e+03,...,1,157,1,1,0,0,1,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257667,0.000006,udp,dns,INT,2,0,114,0,166666.656250,7.600000e+07,...,0,0,33,17,0,0,0,0,Generic,1
257668,0.000009,udp,dns,INT,2,0,114,0,111111.109375,5.066666e+07,...,0,0,24,13,0,0,0,0,Generic,1
257670,0.000009,udp,dns,INT,2,0,114,0,111111.109375,5.066666e+07,...,0,0,3,3,0,0,0,0,Generic,1
257671,0.000009,udp,dns,INT,2,0,114,0,111111.109375,5.066666e+07,...,0,0,30,14,0,0,0,0,Generic,1


In [142]:
df2 = pd.get_dummies(df2, columns=['proto', 'service', 'state'])

In [143]:
df2

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sload,dload,sloss,dloss,...,service_smtp,service_snmp,service_ssh,service_ssl,state_ACC,state_CON,state_FIN,state_INT,state_REQ,state_RST
35,0.983874,10,8,816,1172,17.278635,5.976375e+03,8342.531250,2,2,...,False,False,False,False,False,False,True,False,False,False
40,1.535254,10,10,826,1266,12.375802,3.876883e+03,5940.385254,2,2,...,False,False,False,False,False,False,True,False,False,False
45,1.059359,10,8,830,1134,16.047441,5.641147e+03,7498.874512,2,2,...,False,False,False,False,False,False,True,False,False,False
49,0.990548,10,10,804,1414,19.181301,5.847268e+03,10281.177734,2,2,...,False,False,False,False,False,False,True,False,False,False
72,1.303518,12,8,898,1120,14.575939,5.057084e+03,6014.492676,2,2,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257667,0.000006,2,0,114,0,166666.656250,7.600000e+07,0.000000,0,0,...,False,False,False,False,False,False,False,True,False,False
257668,0.000009,2,0,114,0,111111.109375,5.066666e+07,0.000000,0,0,...,False,False,False,False,False,False,False,True,False,False
257670,0.000009,2,0,114,0,111111.109375,5.066666e+07,0.000000,0,0,...,False,False,False,False,False,False,False,True,False,False
257671,0.000009,2,0,114,0,111111.109375,5.066666e+07,0.000000,0,0,...,False,False,False,False,False,False,False,True,False,False


In [144]:
dftrain = df2.drop(['label','attack_cat'],axis=1)

In [145]:
dftrain

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sload,dload,sloss,dloss,...,service_smtp,service_snmp,service_ssh,service_ssl,state_ACC,state_CON,state_FIN,state_INT,state_REQ,state_RST
35,0.983874,10,8,816,1172,17.278635,5.976375e+03,8342.531250,2,2,...,False,False,False,False,False,False,True,False,False,False
40,1.535254,10,10,826,1266,12.375802,3.876883e+03,5940.385254,2,2,...,False,False,False,False,False,False,True,False,False,False
45,1.059359,10,8,830,1134,16.047441,5.641147e+03,7498.874512,2,2,...,False,False,False,False,False,False,True,False,False,False
49,0.990548,10,10,804,1414,19.181301,5.847268e+03,10281.177734,2,2,...,False,False,False,False,False,False,True,False,False,False
72,1.303518,12,8,898,1120,14.575939,5.057084e+03,6014.492676,2,2,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257667,0.000006,2,0,114,0,166666.656250,7.600000e+07,0.000000,0,0,...,False,False,False,False,False,False,False,True,False,False
257668,0.000009,2,0,114,0,111111.109375,5.066666e+07,0.000000,0,0,...,False,False,False,False,False,False,False,True,False,False
257670,0.000009,2,0,114,0,111111.109375,5.066666e+07,0.000000,0,0,...,False,False,False,False,False,False,False,True,False,False
257671,0.000009,2,0,114,0,111111.109375,5.066666e+07,0.000000,0,0,...,False,False,False,False,False,False,False,True,False,False


In [29]:
scaler2 = StandardScaler()
numerical_columns = dftrain.select_dtypes(include=[np.number]).columns

dftrain[numerical_columns] = scaler2.fit_transform(dftrain[numerical_columns])

dftrain

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,smean,dmean,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports
35,0.045627,-1.203937,0.720069,-0.620894,-0.054516,-0.069046,-0.052843,-0.071704,-0.632651,-0.617410,...,-0.209306,0.116395,0.747477,-0.056250,-0.770080,-0.825661,-0.16555,-0.165479,0.724771,0.0
40,0.182081,-1.203937,0.720069,-0.620894,-0.054516,-0.054114,-0.052801,-0.071172,-0.632680,-0.617437,...,-0.204513,0.035370,0.747477,-0.056210,-0.770080,-0.825661,-0.16555,-0.165479,0.724771,0.0
45,0.064308,-1.203937,0.720069,-0.620894,-0.054516,-0.069046,-0.052784,-0.071918,-0.632658,-0.617414,...,-0.204513,0.096138,0.747477,-0.056509,-0.770080,-0.825661,-0.16555,-0.165479,0.724771,0.0
49,0.047279,-1.203937,0.720069,-0.620894,-0.054516,-0.054114,-0.052894,-0.070336,-0.632639,-0.617412,...,-0.218891,0.092087,0.747477,-0.055202,-0.770080,-0.825661,-0.16555,-0.165479,0.724771,0.0
72,0.124732,-1.203937,0.720069,-0.620894,-0.043517,-0.069046,-0.052498,-0.071997,-0.632667,-0.617422,...,-0.242855,0.088036,0.747477,-0.056618,-0.770080,-0.825661,-0.16555,-0.165479,0.724771,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257667,-0.197858,0.830608,-0.669756,0.898620,-0.098512,-0.128774,-0.055798,-0.078325,0.356981,0.338043,...,-0.329125,-0.479140,-0.211689,-0.058756,2.244145,1.337896,-0.16555,-0.165479,-0.288988,0.0
257668,-0.197857,0.830608,-0.669756,0.898620,-0.098512,-0.128774,-0.055798,-0.078325,0.027070,0.019533,...,-0.329125,-0.479140,-0.211689,-0.058756,1.396394,0.797006,-0.16555,-0.165479,-0.288988,0.0
257670,-0.197857,0.830608,-0.669756,0.898620,-0.098512,-0.128774,-0.055798,-0.078325,0.027070,0.019533,...,-0.329125,-0.479140,-0.211689,-0.058756,-0.581691,-0.555217,-0.16555,-0.165479,-0.288988,0.0
257671,-0.197857,0.830608,-0.669756,0.898620,-0.098512,-0.128774,-0.055798,-0.078325,0.027070,0.019533,...,-0.329125,-0.479140,-0.211689,-0.058756,1.961562,0.932229,-0.16555,-0.165479,-0.288988,0.0


In [30]:
train_df = pd.read_parquet('UNSW_NB15_training-set.parquet')
test_df = pd.read_parquet('UNSW_NB15_testing-set.parquet')

# Concatenate the training and test sets
df1 = pd.concat([train_df, test_df], ignore_index=True)

df1.info()

df1.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257673 entries, 0 to 257672
Data columns (total 36 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   dur                257673 non-null  float32 
 1   proto              257673 non-null  object  
 2   service            257673 non-null  category
 3   state              257673 non-null  object  
 4   spkts              257673 non-null  int16   
 5   dpkts              257673 non-null  int16   
 6   sbytes             257673 non-null  int32   
 7   dbytes             257673 non-null  int32   
 8   rate               257673 non-null  float32 
 9   sload              257673 non-null  float32 
 10  dload              257673 non-null  float32 
 11  sloss              257673 non-null  int16   
 12  dloss              257673 non-null  int16   
 13  sinpkt             257673 non-null  float32 
 14  dinpkt             257673 non-null  float32 
 15  sjit               257673 non-null

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
0,1.1e-05,udp,-,INT,2,0,496,0,90909.09375,180363632.0,...,0,0,1,1,0,0,0,0,Normal,0
1,8e-06,udp,-,INT,2,0,1762,0,125000.0,881000000.0,...,0,0,1,1,0,0,0,0,Normal,0
2,5e-06,udp,-,INT,2,0,1068,0,200000.0,854400000.0,...,0,0,1,1,0,0,0,0,Normal,0
3,6e-06,udp,-,INT,2,0,900,0,166666.65625,600000000.0,...,0,0,2,1,0,0,0,0,Normal,0
4,1e-05,udp,-,INT,2,0,2126,0,100000.0,850400000.0,...,0,0,2,1,0,0,0,0,Normal,0


In [31]:
df1 = df1[df1['service']!= '-']
df1

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
35,0.983874,tcp,http,FIN,10,8,816,1172,17.278635,5.976375e+03,...,1,184,1,1,0,0,1,0,Normal,0
40,1.535254,tcp,http,FIN,10,10,826,1266,12.375802,3.876883e+03,...,1,187,1,1,0,0,1,0,Normal,0
45,1.059359,tcp,http,FIN,10,8,830,1134,16.047441,5.641147e+03,...,1,165,1,1,0,0,1,0,Normal,0
49,0.990548,tcp,http,FIN,10,10,804,1414,19.181301,5.847268e+03,...,1,261,1,1,0,0,1,0,Normal,0
72,1.303518,tcp,http,FIN,12,8,898,1120,14.575939,5.057084e+03,...,1,157,1,1,0,0,1,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257667,0.000006,udp,dns,INT,2,0,114,0,166666.656250,7.600000e+07,...,0,0,33,17,0,0,0,0,Generic,1
257668,0.000009,udp,dns,INT,2,0,114,0,111111.109375,5.066666e+07,...,0,0,24,13,0,0,0,0,Generic,1
257670,0.000009,udp,dns,INT,2,0,114,0,111111.109375,5.066666e+07,...,0,0,3,3,0,0,0,0,Generic,1
257671,0.000009,udp,dns,INT,2,0,114,0,111111.109375,5.066666e+07,...,0,0,30,14,0,0,0,0,Generic,1


In [32]:
df1['proto'] = label1.transform(df1['proto'])
df1['service'] = label2.transform(df1['service'])
df1['state'] = label3.transform(df1['state'])
df1

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
35,0.983874,0,4,2,10,8,816,1172,17.278635,5.976375e+03,...,1,184,1,1,0,0,1,0,Normal,0
40,1.535254,0,4,2,10,10,826,1266,12.375802,3.876883e+03,...,1,187,1,1,0,0,1,0,Normal,0
45,1.059359,0,4,2,10,8,830,1134,16.047441,5.641147e+03,...,1,165,1,1,0,0,1,0,Normal,0
49,0.990548,0,4,2,10,10,804,1414,19.181301,5.847268e+03,...,1,261,1,1,0,0,1,0,Normal,0
72,1.303518,0,4,2,12,8,898,1120,14.575939,5.057084e+03,...,1,157,1,1,0,0,1,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257667,0.000006,1,1,3,2,0,114,0,166666.656250,7.600000e+07,...,0,0,33,17,0,0,0,0,Generic,1
257668,0.000009,1,1,3,2,0,114,0,111111.109375,5.066666e+07,...,0,0,24,13,0,0,0,0,Generic,1
257670,0.000009,1,1,3,2,0,114,0,111111.109375,5.066666e+07,...,0,0,3,3,0,0,0,0,Generic,1
257671,0.000009,1,1,3,2,0,114,0,111111.109375,5.066666e+07,...,0,0,30,14,0,0,0,0,Generic,1


In [33]:
X_1 = df1.drop(['attack_cat', "label"], axis=1)
y_1 = df1['attack_cat']

In [34]:
X_1_log = X_1.apply(lambda x: np.log(x + 1))
X_1_log

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,smean,dmean,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports
35,0.685052,0.000000,1.609438,1.098612,2.397895,2.197225,6.705639,7.067320,2.905733,8.695737,...,4.418840,4.997212,0.693147,5.220356,0.693359,0.693359,0.0,0.0,0.693359,0.0
40,0.930294,0.000000,1.609438,1.098612,2.397895,2.397895,6.717805,7.144407,2.593447,8.263044,...,4.430817,4.852030,0.693147,5.236442,0.693359,0.693359,0.0,0.0,0.693359,0.0
45,0.722395,0.000000,1.609438,1.098612,2.397895,2.197225,6.722630,7.034388,2.836000,8.638020,...,4.430817,4.962845,0.693147,5.111988,0.693359,0.693359,0.0,0.0,0.693359,0.0
49,0.688410,0.000000,1.609438,1.098612,2.397895,2.397895,6.690842,7.254885,3.004756,8.673901,...,4.394449,4.955827,0.693147,5.568345,0.693359,0.693359,0.0,0.0,0.693359,0.0
72,0.834438,0.000000,1.609438,1.098612,2.564949,2.197225,6.801283,7.021976,2.745727,8.528743,...,4.330733,4.948760,0.693147,5.062595,0.693359,0.693359,0.0,0.0,0.693359,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257667,0.000006,0.693147,0.693147,1.386294,1.098612,0.000000,4.744932,0.000000,12.023757,18.146244,...,4.060443,0.000000,0.000000,0.000000,3.525391,2.890625,0.0,0.0,0.000000,0.0
257668,0.000009,0.693147,0.693147,1.386294,1.098612,0.000000,4.744932,0.000000,11.618295,17.740778,...,4.060443,0.000000,0.000000,0.000000,3.218750,2.638672,0.0,0.0,0.000000,0.0
257670,0.000009,0.693147,0.693147,1.386294,1.098612,0.000000,4.744932,0.000000,11.618295,17.740778,...,4.060443,0.000000,0.000000,0.000000,1.386719,1.386719,0.0,0.0,0.000000,0.0
257671,0.000009,0.693147,0.693147,1.386294,1.098612,0.000000,4.744932,0.000000,11.618295,17.740778,...,4.060443,0.000000,0.000000,0.000000,3.433594,2.708984,0.0,0.0,0.000000,0.0


In [119]:
main_scale = StandardScaler()
X_scaled = main_scale.fit_transform(X_1_log)
X_scaled

array([[ 0.74919573, -1.20393697,  1.01046492, ..., -0.16856844,
         1.53353253,  0.        ],
       [ 1.21424254, -1.20393697,  1.01046492, ..., -0.16856844,
         1.53353253,  0.        ],
       [ 0.82000882, -1.20393697,  1.01046492, ..., -0.16856844,
         1.53353253,  0.        ],
       ...,
       [-0.54983339,  0.83060827, -0.7577497 , ..., -0.16856844,
        -0.47496573,  0.        ],
       [-0.54983339,  0.83060827, -0.7577497 , ..., -0.16856844,
        -0.47496573,  0.        ],
       [-0.54983339,  0.83060827, -0.7577497 , ..., -0.16856844,
        -0.47496573,  0.        ]])

In [36]:
X_train_full, X_test_1, y_train_full, y_test_1 = train_test_split(X_scaled, y_1, test_size=0.2, random_state=42)
X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42)

In [37]:
X_train_full[0]

array([-0.54983519,  0.83060827, -0.7577497 ,  0.84710892, -0.70869571,
       -0.85650531, -0.69898826, -0.94344392,  0.85114207,  0.84544507,
       -0.96090207, -0.66894986, -0.65709664, -0.75759531, -0.75473008,
       -0.80826585, -0.73201307, -0.83060827, -0.82902483, -0.82907676,
       -0.83059349, -0.55802239, -0.51468682, -0.55325273, -0.4769915 ,
       -0.96385296, -0.51613818, -0.3893987 ,  0.30894298,  0.46780811,
       -0.1685644 , -0.16856844, -0.47496573,  0.        ])

In [38]:
label_encoder = LabelEncoder()
y_1_encoded = label_encoder.fit_transform(y_1)

In [39]:
X_train_full, X_test_1, y_train_full, y_test_1 = train_test_split(X_scaled, y_1_encoded, test_size=0.2, random_state=42)
X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42)

In [40]:
# Initialize and train the XGBoost classifier
xgb_classifier_2 = XGBClassifier(objective='multi:softprob', num_class=len(np.unique(y_1_encoded)))

# Perform k-fold cross-validation
k = 10
cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
scores = cross_val_score(xgb_classifier_2, X_train_full, y_train_full, cv=cv, scoring='accuracy')

# Evaluate the model's cross-validation performance
mean_accuracy = scores.mean()
std_accuracy = scores.std()
print(f'Mean Accuracy: {mean_accuracy:.2f}')
print(f'Standard Deviation: {std_accuracy:.2f}')

# Train the final model
xgb_classifier_2.fit(X_train_1, y_train_1)

# Predict and evaluate on the validation set
y_val_pred_xgb = xgb_classifier_2.predict(X_val_1)
y_val_pred_decoded = label_encoder.inverse_transform(y_val_pred_xgb)
accuracy_val_xgb = accuracy_score(y_val_1, y_val_pred_xgb)
print("\nValidation Set Performance:")
print(f'Accuracy: {accuracy_val_xgb:.2f}')

# Predict and evaluate on the test set
y_pred_xgb = xgb_classifier_2.predict(X_test_1)
y_pred_decoded = label_encoder.inverse_transform(y_pred_xgb)
accuracy_xgb = accuracy_score(y_test_1, y_pred_xgb)

# Decode the true labels in the test set for a readable classification report
y_test_1_decoded = label_encoder.inverse_transform(y_test_1)

# Classification report using decoded labels
classification_report_str_xgb = classification_report(y_test_1_decoded, y_pred_decoded)
print("\nFinal Model Performance:")
print(f'Accuracy: {accuracy_xgb:.2f}')
print('Classification Report:')
print(classification_report_str_xgb)

Mean Accuracy: 0.94
Standard Deviation: 0.00

Validation Set Performance:
Accuracy: 0.94

Final Model Performance:
Accuracy: 0.94
Classification Report:
                precision    recall  f1-score   support

      Analysis       0.93      0.40      0.56       105
      Backdoor       1.00      0.09      0.17        22
           DoS       0.61      0.21      0.32       530
      Exploits       0.85      0.93      0.89      4375
       Fuzzers       0.70      0.70      0.70       425
       Generic       1.00      0.99      1.00     11506
        Normal       0.96      0.98      0.97      5821
Reconnaissance       0.60      0.53      0.56       463
         Worms       0.50      0.29      0.37        24

      accuracy                           0.94     23271
     macro avg       0.79      0.57      0.61     23271
  weighted avg       0.94      0.94      0.93     23271



In [41]:
dft = dftrain.iloc[0]

In [42]:
dft

dur                  0.045627
proto               -1.203937
service              0.720069
state               -0.620894
spkts               -0.054516
dpkts               -0.069046
sbytes              -0.052843
dbytes              -0.071704
rate                -0.632651
sload               -0.617410
dload               -0.207231
sloss               -0.047807
dloss               -0.067025
sinpkt               0.279364
dinpkt               0.301790
sjit                 0.129576
djit                -0.127611
swin                 1.203937
stcpb               -0.064478
dtcpb                0.569068
dwin                 1.203958
tcprtt               2.534237
synack               2.557616
ackdat               2.303864
smean               -0.209306
dmean                0.116395
trans_depth          0.747477
response_body_len   -0.056250
ct_src_dport_ltm    -0.770080
ct_dst_sport_ltm    -0.825661
is_ftp_login        -0.165550
ct_ftp_cmd          -0.165479
ct_flw_http_mthd     0.724771
is_sm_ips_

In [43]:
# Select the first row and convert to array



In [44]:
first_row_array = X_train_full[0]

In [45]:
first_row_array = first_row_array.reshape(1, -1)

val1 = xgb_classifier_2.predict(first_row_array)
val1 = label_encoder.inverse_transform(val1)
val1

array(['Generic'], dtype=object)

In [46]:
# first i applied label encoder1,2,3 then i applied log transformation on that data then scaler transformation to transform that data
# and put that type of data in model to predict and when we taking value in single then we have to apply reshape(1,-1) to get in shape

In [47]:
df1 = df1.drop(['attack_cat','label'],axis=1)
testing2 = df1.iloc[0]
testing2

dur                  9.838740e-01
proto                0.000000e+00
service              4.000000e+00
state                2.000000e+00
spkts                1.000000e+01
dpkts                8.000000e+00
sbytes               8.160000e+02
dbytes               1.172000e+03
rate                 1.727864e+01
sload                5.976375e+03
dload                8.342531e+03
sloss                2.000000e+00
dloss                2.000000e+00
sinpkt               1.093193e+02
dinpkt               1.249329e+02
sjit                 5.929212e+03
djit                 1.925904e+02
swin                 2.550000e+02
stcpb                7.941674e+08
dtcpb                1.624757e+09
dwin                 2.550000e+02
tcprtt               2.065720e-01
synack               1.083930e-01
ackdat               9.817900e-02
smean                8.200000e+01
dmean                1.470000e+02
trans_depth          1.000000e+00
response_body_len    1.840000e+02
ct_src_dport_ltm     1.000000e+00
ct_dst_sport_l

In [48]:
testing_log = testing2.apply(lambda x:np.log(x+1))
testing_log

dur                   0.685052
proto                 0.000000
service               1.609438
state                 1.098612
spkts                 2.397895
dpkts                 2.197225
sbytes                6.705639
dbytes                7.067320
rate                  2.905733
sload                 8.695737
dload                 9.029242
sloss                 1.098612
dloss                 1.098612
sinpkt                4.703379
dinpkt                4.835749
sjit                  8.687815
djit                  5.265745
swin                  5.545177
stcpb                20.492805
dtcpb                21.208624
dwin                  5.545177
tcprtt                0.187783
synack                0.102911
ackdat                0.093653
smean                 4.418841
dmean                 4.997212
trans_depth           0.693147
response_body_len     5.220356
ct_src_dport_ltm      0.693147
ct_dst_sport_ltm      0.693147
is_ftp_login          0.000000
ct_ftp_cmd            0.000000
ct_flw_h

In [49]:
# Assuming testing_log is a Pandas Series
# Convert the Series to a DataFrame with a single row
testing_log_df = pd.DataFrame(testing_log).T  # Transpose to make it a row

# Now transform using StandardScaler
x_val = main_scale.transform(testing_log_df)
x_val = x_val.reshape(1,-1)
x_val

array([[ 0.74919564, -1.20393697,  1.01046492, -0.49722005,  0.50535066,
         0.63485718,  0.41328813,  0.88690189, -1.37086198, -1.39631741,
         0.64337112,  0.4303808 ,  0.40021615,  1.54233598,  1.601285  ,
         1.38026826,  1.00039908,  1.20393696,  1.1357058 ,  1.20420276,
         1.20395838,  2.51484459,  2.55825644,  2.32389131,  0.03136299,
         0.99765843,  1.89805886,  1.64513339, -0.96622044, -0.92732152,
        -0.1685644 , -0.16856844,  1.53291785,  0.        ]])

In [50]:
val12=(xgb_classifier_2.predict(x_val))
print(label_encoder.inverse_transform(val12))

['Normal']


In [51]:
# label1=proto
# label2 = service
# label3 = state
# main_scaler = to scale
# label_encoder = to get inverse of attack category
import joblib

joblib.dump(label1,'proto_encoder.joblib')
joblib.dump(label2,'service_encoder.joblib')
joblib.dump(label3,'state_encoder.joblib')
joblib.dump(main_scale,'scaler.joblib')
joblib.dump(label_encoder,'inverse_encoder.joblib')
joblib.dump(xgb_classifier_2,'model1.joblib')

['model1.joblib']

In [52]:
testing_data = df1.iloc[:5000]
testing_data

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,smean,dmean,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports
35,0.983874,0,4,2,10,8,816,1172,17.278635,5.976375e+03,...,82,147,1,184,1,1,0,0,1,0
40,1.535254,0,4,2,10,10,826,1266,12.375802,3.876883e+03,...,83,127,1,187,1,1,0,0,1,0
45,1.059359,0,4,2,10,8,830,1134,16.047441,5.641147e+03,...,83,142,1,165,1,1,0,0,1,0
49,0.990548,0,4,2,10,10,804,1414,19.181301,5.847268e+03,...,80,141,1,261,1,1,0,0,1,0
72,1.303518,0,4,2,12,8,898,1120,14.575939,5.057084e+03,...,75,140,1,157,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13243,0.000006,1,1,3,2,0,114,0,166666.656250,7.600000e+07,...,57,0,0,0,25,25,0,0,0,0
13244,0.000005,1,1,3,2,0,114,0,200000.000000,9.120000e+07,...,57,0,0,0,18,18,0,0,0,0
13245,0.000005,1,1,3,2,0,114,0,200000.000000,9.120000e+07,...,57,0,0,0,11,11,0,0,0,0
13246,0.000003,1,1,3,2,0,114,0,333333.312500,1.520000e+08,...,57,0,0,0,25,25,0,0,0,0


In [53]:
testing_data.to_excel('testing_data.xlsx', index=False)

In [54]:
xt = scaler.transform(testing_data)
sm = pd.DataFramee(xt)
sm

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- label


In [56]:
df1['attack_cat'=Normal]


SyntaxError: cannot assign to literal here. Maybe you meant '==' instead of '='? (764388210.py, line 1)

In [57]:
df_normal = df1[df1['attack_cat'] == 'Normal']
df_normal

KeyError: 'attack_cat'

In [146]:
train_df = pd.read_parquet('UNSW_NB15_training-set.parquet')
test_df = pd.read_parquet('UNSW_NB15_testing-set.parquet')

# Concatenate the training and test sets
df1 = pd.concat([train_df, test_df], ignore_index=True)

df1.info()

df1.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257673 entries, 0 to 257672
Data columns (total 36 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   dur                257673 non-null  float32 
 1   proto              257673 non-null  object  
 2   service            257673 non-null  category
 3   state              257673 non-null  object  
 4   spkts              257673 non-null  int16   
 5   dpkts              257673 non-null  int16   
 6   sbytes             257673 non-null  int32   
 7   dbytes             257673 non-null  int32   
 8   rate               257673 non-null  float32 
 9   sload              257673 non-null  float32 
 10  dload              257673 non-null  float32 
 11  sloss              257673 non-null  int16   
 12  dloss              257673 non-null  int16   
 13  sinpkt             257673 non-null  float32 
 14  dinpkt             257673 non-null  float32 
 15  sjit               257673 non-null

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
0,1.1e-05,udp,-,INT,2,0,496,0,90909.09375,180363632.0,...,0,0,1,1,0,0,0,0,Normal,0
1,8e-06,udp,-,INT,2,0,1762,0,125000.0,881000000.0,...,0,0,1,1,0,0,0,0,Normal,0
2,5e-06,udp,-,INT,2,0,1068,0,200000.0,854400000.0,...,0,0,1,1,0,0,0,0,Normal,0
3,6e-06,udp,-,INT,2,0,900,0,166666.65625,600000000.0,...,0,0,2,1,0,0,0,0,Normal,0
4,1e-05,udp,-,INT,2,0,2126,0,100000.0,850400000.0,...,0,0,2,1,0,0,0,0,Normal,0


In [147]:
df1 = df1[df1['service']!='-']
df1

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
35,0.983874,tcp,http,FIN,10,8,816,1172,17.278635,5.976375e+03,...,1,184,1,1,0,0,1,0,Normal,0
40,1.535254,tcp,http,FIN,10,10,826,1266,12.375802,3.876883e+03,...,1,187,1,1,0,0,1,0,Normal,0
45,1.059359,tcp,http,FIN,10,8,830,1134,16.047441,5.641147e+03,...,1,165,1,1,0,0,1,0,Normal,0
49,0.990548,tcp,http,FIN,10,10,804,1414,19.181301,5.847268e+03,...,1,261,1,1,0,0,1,0,Normal,0
72,1.303518,tcp,http,FIN,12,8,898,1120,14.575939,5.057084e+03,...,1,157,1,1,0,0,1,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257667,0.000006,udp,dns,INT,2,0,114,0,166666.656250,7.600000e+07,...,0,0,33,17,0,0,0,0,Generic,1
257668,0.000009,udp,dns,INT,2,0,114,0,111111.109375,5.066666e+07,...,0,0,24,13,0,0,0,0,Generic,1
257670,0.000009,udp,dns,INT,2,0,114,0,111111.109375,5.066666e+07,...,0,0,3,3,0,0,0,0,Generic,1
257671,0.000009,udp,dns,INT,2,0,114,0,111111.109375,5.066666e+07,...,0,0,30,14,0,0,0,0,Generic,1


In [148]:
df_normal = df1[df1['attack_cat'] == 'Normal']
df_normal

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
35,0.983874,tcp,http,FIN,10,8,816,1172,17.278635,5976.375000,...,1,184,1,1,0,0,1,0,Normal,0
40,1.535254,tcp,http,FIN,10,10,826,1266,12.375802,3876.882812,...,1,187,1,1,0,0,1,0,Normal,0
45,1.059359,tcp,http,FIN,10,8,830,1134,16.047441,5641.147461,...,1,165,1,1,0,0,1,0,Normal,0
49,0.990548,tcp,http,FIN,10,10,804,1414,19.181301,5847.268066,...,1,261,1,1,0,0,1,0,Normal,0
72,1.303518,tcp,http,FIN,12,8,898,1120,14.575939,5057.083984,...,1,157,1,1,0,0,1,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189786,1.018870,tcp,pop3,FIN,24,48,1126,41112,69.685043,8479.982422,...,0,0,1,1,0,0,0,0,Normal,0
189790,2.562474,tcp,ftp,FIN,22,26,1166,1792,18.341648,3474.766846,...,0,0,1,1,1,1,0,0,Normal,0
189792,1.704022,tcp,ftp-data,FIN,8,8,364,1258,8.802703,1497.633179,...,0,0,1,1,0,0,0,0,Normal,0
189794,0.733776,tcp,http,FIN,10,6,952,268,20.442207,9343.451172,...,1,0,1,1,0,0,1,0,Normal,0


In [149]:
dfw = df1.drop(['attack_cat','label'], axis=1)

In [84]:
dfw

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,smean,dmean,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports
35,0.983874,tcp,http,FIN,10,8,816,1172,17.278635,5.976375e+03,...,82,147,1,184,1,1,0,0,1,0
40,1.535254,tcp,http,FIN,10,10,826,1266,12.375802,3.876883e+03,...,83,127,1,187,1,1,0,0,1,0
45,1.059359,tcp,http,FIN,10,8,830,1134,16.047441,5.641147e+03,...,83,142,1,165,1,1,0,0,1,0
49,0.990548,tcp,http,FIN,10,10,804,1414,19.181301,5.847268e+03,...,80,141,1,261,1,1,0,0,1,0
72,1.303518,tcp,http,FIN,12,8,898,1120,14.575939,5.057084e+03,...,75,140,1,157,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257667,0.000006,udp,dns,INT,2,0,114,0,166666.656250,7.600000e+07,...,57,0,0,0,33,17,0,0,0,0
257668,0.000009,udp,dns,INT,2,0,114,0,111111.109375,5.066666e+07,...,57,0,0,0,24,13,0,0,0,0
257670,0.000009,udp,dns,INT,2,0,114,0,111111.109375,5.066666e+07,...,57,0,0,0,3,3,0,0,0,0
257671,0.000009,udp,dns,INT,2,0,114,0,111111.109375,5.066666e+07,...,57,0,0,0,30,14,0,0,0,0


In [150]:
dfq = dfw.iloc[:5000]
dfq = pd.DataFrame(dfq)

In [64]:
dfq.to_excel('testing12.xlsx', index=False)

In [88]:
dfq


Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,smean,dmean,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports
35,0.983874,tcp,http,FIN,10,8,816,1172,17.278635,5.976375e+03,...,82,147,1,184,1,1,0,0,1,0
40,1.535254,tcp,http,FIN,10,10,826,1266,12.375802,3.876883e+03,...,83,127,1,187,1,1,0,0,1,0
45,1.059359,tcp,http,FIN,10,8,830,1134,16.047441,5.641147e+03,...,83,142,1,165,1,1,0,0,1,0
49,0.990548,tcp,http,FIN,10,10,804,1414,19.181301,5.847268e+03,...,80,141,1,261,1,1,0,0,1,0
72,1.303518,tcp,http,FIN,12,8,898,1120,14.575939,5.057084e+03,...,75,140,1,157,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13243,0.000006,udp,dns,INT,2,0,114,0,166666.656250,7.600000e+07,...,57,0,0,0,25,25,0,0,0,0
13244,0.000005,udp,dns,INT,2,0,114,0,200000.000000,9.120000e+07,...,57,0,0,0,18,18,0,0,0,0
13245,0.000005,udp,dns,INT,2,0,114,0,200000.000000,9.120000e+07,...,57,0,0,0,11,11,0,0,0,0
13246,0.000003,udp,dns,INT,2,0,114,0,333333.312500,1.520000e+08,...,57,0,0,0,25,25,0,0,0,0


In [151]:
dfq = pd.get_dummies(dfq, columns=['proto', 'service', 'state'])
dfq

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sload,dload,sloss,dloss,...,service_irc,service_pop3,service_radius,service_smtp,service_snmp,service_ssh,service_ssl,state_CON,state_FIN,state_INT
35,0.983874,10,8,816,1172,17.278635,5.976375e+03,8342.531250,2,2,...,False,False,False,False,False,False,False,False,True,False
40,1.535254,10,10,826,1266,12.375802,3.876883e+03,5940.385254,2,2,...,False,False,False,False,False,False,False,False,True,False
45,1.059359,10,8,830,1134,16.047441,5.641147e+03,7498.874512,2,2,...,False,False,False,False,False,False,False,False,True,False
49,0.990548,10,10,804,1414,19.181301,5.847268e+03,10281.177734,2,2,...,False,False,False,False,False,False,False,False,True,False
72,1.303518,12,8,898,1120,14.575939,5.057084e+03,6014.492676,2,2,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13243,0.000006,2,0,114,0,166666.656250,7.600000e+07,0.000000,0,0,...,False,False,False,False,False,False,False,False,False,True
13244,0.000005,2,0,114,0,200000.000000,9.120000e+07,0.000000,0,0,...,False,False,False,False,False,False,False,False,False,True
13245,0.000005,2,0,114,0,200000.000000,9.120000e+07,0.000000,0,0,...,False,False,False,False,False,False,False,False,False,True
13246,0.000003,2,0,114,0,333333.312500,1.520000e+08,0.000000,0,0,...,False,False,False,False,False,False,False,False,False,True


In [152]:
dft = dfq.apply(lambda x: np.log(x + 1))
dft

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sload,dload,sloss,dloss,...,service_irc,service_pop3,service_radius,service_smtp,service_snmp,service_ssh,service_ssl,state_CON,state_FIN,state_INT
35,0.685052,2.397895,2.197225,6.705639,7.067320,2.905733,8.695737,9.029242,1.098612,1.098612,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.000000
40,0.930294,2.397895,2.397895,6.717805,7.144407,2.593447,8.263044,8.689697,1.098612,1.098612,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.000000
45,0.722395,2.397895,2.197225,6.722630,7.034388,2.836000,8.638020,8.922642,1.098612,1.098612,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.000000
49,0.688410,2.397895,2.397895,6.690842,7.254885,3.004756,8.673901,9.238168,1.098612,1.098612,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.000000
72,0.834438,2.564949,2.197225,6.801283,7.021976,2.745727,8.528743,8.702093,1.098612,1.098612,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13243,0.000006,1.098612,0.000000,4.744932,0.000000,12.023757,18.146244,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.693147
13244,0.000005,1.098612,0.000000,4.744932,0.000000,12.206078,18.328566,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.693147
13245,0.000005,1.098612,0.000000,4.744932,0.000000,12.206078,18.328566,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.693147
13246,0.000003,1.098612,0.000000,4.744932,0.000000,12.716901,18.839392,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.693147


In [153]:
dfq.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5000 entries, 35 to 13247
Data columns (total 49 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   dur                5000 non-null   float32
 1   spkts              5000 non-null   int16  
 2   dpkts              5000 non-null   int16  
 3   sbytes             5000 non-null   int32  
 4   dbytes             5000 non-null   int32  
 5   rate               5000 non-null   float32
 6   sload              5000 non-null   float32
 7   dload              5000 non-null   float32
 8   sloss              5000 non-null   int16  
 9   dloss              5000 non-null   int16  
 10  sinpkt             5000 non-null   float32
 11  dinpkt             5000 non-null   float32
 12  sjit               5000 non-null   float32
 13  djit               5000 non-null   float32
 14  swin               5000 non-null   int16  
 15  stcpb              5000 non-null   int64  
 16  dtcpb              5000 non

In [155]:
dfq1 = main_scale.fit_transform(dfq)
dfq1

array([[-0.02863081, -0.08493833, -0.08464642, ..., -0.08980265,
         0.59305706, -0.58073742],
       [ 0.1348913 , -0.08493833, -0.07735396, ..., -0.08980265,
         0.59305706, -0.58073742],
       [-0.00624433, -0.08493833, -0.08464642, ..., -0.08980265,
         0.59305706, -0.58073742],
       ...,
       [-0.32041569, -0.10792803, -0.11381623, ..., -0.08980265,
        -1.68617838,  1.72194862],
       [-0.32041628, -0.10792803, -0.11381623, ..., -0.08980265,
        -1.68617838,  1.72194862],
       [-0.32041539, -0.10792803, -0.11381623, ..., -0.08980265,
        -1.68617838,  1.72194862]])

In [156]:
predict1 = xgb_classifier_2.predict(dfq)
vla = label_encoder.inverse_transform(predict1)


ValueError: Feature shape mismatch, expected: 34, got 49

In [137]:
unique_values, counts = np.unique(vla, return_counts=True)
print(dict(zip(unique_values, counts)))

{'Exploits': 2805, 'Fuzzers': 4, 'Generic': 2191}


In [100]:
model = joblib.load('labelmodel.joblib')

In [138]:
predict2 = model.predict(dfq)
unique_values, counts = np.unique(predict2, return_counts=True)
print(dict(zip(unique_values, counts)))

{1: 5000}




In [132]:
# Initialize and train a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Perform k-fold cross-validation
k = 10
cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
scores = cross_val_score(rf_classifier, X_train_full, y_train_full, cv=cv, scoring='accuracy')

# Evaluate the model's cross-validation performance
mean_accuracy = scores.mean()
std_accuracy = scores.std()
print(f'Mean Accuracy: {mean_accuracy:.2f}')
print(f'Standard Deviation: {std_accuracy:.2f}')

# Train the final model on the training set
rf_classifier.fit(X_train_1, y_train_1)

# Predict and evaluate on the validation set
y_val_pred = rf_classifier.predict(X_val_1)
accuracy_val = accuracy_score(y_val_1, y_val_pred)
print("\nValidation Set Performance:")
print(f'Accuracy: {accuracy_val:.2f}')

# Predict and evaluate on the test set
y_pred_1 = rf_classifier.predict(X_test_1)
accuracy_1 = accuracy_score(y_test_1, y_pred_1)
classification_report_str_1 = classification_report(y_test_1, y_pred_1)

# Final model performance and classification report
print("\nFinal Model Performance:")
print(f'Accuracy: {accuracy_1:.2f}')
print('Classification Report:')
print(classification_report_str_1)

KeyboardInterrupt: 

In [139]:
predict1 = rf_classifier.predict(dfq)
vla = label_encoder.inverse_transform(predict1)
unique_values, counts = np.unique(vla, return_counts=True)
print(dict(zip(unique_values, counts)))

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [111]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report

# Initialize and train a Logistic Regression model
logistic_regressor = LogisticRegression(max_iter=1000, random_state=42)

# Perform k-fold cross-validation
k = 10
cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
scores = cross_val_score(logistic_regressor, X_train_full, y_train_full, cv=cv, scoring='accuracy')

# Evaluate the model's cross-validation performance
mean_accuracy = scores.mean()
std_accuracy = scores.std()
print(f'Mean Accuracy: {mean_accuracy:.2f}')
print(f'Standard Deviation: {std_accuracy:.2f}')

# Train the final model on the training set
logistic_regressor.fit(X_train_1, y_train_1)

# Predict and evaluate on the validation set
y_val_pred = logistic_regressor.predict(X_val_1)
accuracy_val = accuracy_score(y_val_1, y_val_pred)
print("\nValidation Set Performance:")
print(f'Accuracy: {accuracy_val:.2f}')

# Predict and evaluate on the test set
y_pred_1 = logistic_regressor.predict(X_test_1)
accuracy_1 = accuracy_score(y_test_1, y_pred_1)
classification_report_str_1 = classification_report(y_test_1, y_pred_1)

# Final model performance and classification report
print("\nFinal Model Performance:")
print(f'Accuracy: {accuracy_1:.2f}')
print('Classification Report:')
print(classification_report_str_1)


Mean Accuracy: 0.89
Standard Deviation: 0.00

Validation Set Performance:
Accuracy: 0.89

Final Model Performance:
Accuracy: 0.89
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.18      0.29       105
           1       0.00      0.00      0.00        22
           2       0.17      0.00      0.00       530
           3       0.70      0.90      0.79      4375
           4       0.51      0.42      0.46       425
           5       0.99      0.99      0.99     11506
           6       0.91      0.87      0.89      5821
           7       0.36      0.19      0.25       463
           8       0.00      0.00      0.00        24

    accuracy                           0.89     23271
   macro avg       0.49      0.40      0.41     23271
weighted avg       0.87      0.89      0.88     23271



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [125]:
predict1 = logistic_regressor.predict(dfq1)
vla = label_encoder.inverse_transform(predict1)
unique_values, counts = np.unique(vla, return_counts=True)
print(dict(zip(unique_values, counts)))

{'Normal': 1288, 'Reconnaissance': 3218, 'Worms': 494}


In [135]:
joblib.dump(scaler2,'scaler2.joblib')

['scaler2.joblib']