In [64]:
! pip install numpy pandas matplotlib seaborn scikit-learn




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import numpy as np  
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import random

In [66]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [67]:
folder_path='archive/nsl-kdd'
file_path_20_percent =f'{folder_path}/KDDTrain+_20Percent.txt'    
file_path_full_training_set =f'{folder_path}/KDDTrain+.txt' 
file_path_test = f'{folder_path}/KDDTest+.txt' 

df = pd.read_csv(file_path_full_training_set)
test_df = pd.read_csv(file_path_test)



In [68]:
columns = (['duration'
,'protocol_type'
,'service'
,'flag'
,'src_bytes'
,'dst_bytes'
,'land'
,'wrong_fragment'
,'urgent'
,'hot'
,'num_failed_logins'
,'logged_in'
,'num_compromised'
,'root_shell'
,'su_attempted'
,'num_root'
,'num_file_creations'
,'num_shells'
,'num_access_files'
,'num_outbound_cmds'
,'is_host_login'
,'is_guest_login'
,'count'
,'srv_count'
,'serror_rate'
,'srv_serror_rate'
,'rerror_rate'
,'srv_rerror_rate'
,'same_srv_rate'
,'diff_srv_rate'
,'srv_diff_host_rate'
,'dst_host_count'
,'dst_host_srv_count'
,'dst_host_same_srv_rate'
,'dst_host_diff_srv_rate'
,'dst_host_same_src_port_rate'
,'dst_host_srv_diff_host_rate'
,'dst_host_serror_rate'
,'dst_host_srv_serror_rate'
,'dst_host_rerror_rate'
,'dst_host_srv_rerror_rate'
,'attack'
,'level'])

df.columns = columns
test_df.columns = columns

df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,level
0,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
1,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
2,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
3,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21
4,0,tcp,private,REJ,0,0,0,0,0,0,...,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21


In [69]:
is_attack = df.attack.map(lambda a: 0 if a == 'normal' else 1) 
test_attack = test_df.attack.map(lambda a: 0 if a == 'normal' else 1) 

df['attack_flag'] = is_attack 
test_df['attack_flag'] = test_attack 

df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,level,attack_flag
0,0,udp,other,SF,146,0,0,0,0,0,...,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15,0
1,0,tcp,private,S0,0,0,0,0,0,0,...,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19,1
2,0,tcp,http,SF,232,8153,0,0,0,0,...,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21,0
3,0,tcp,http,SF,199,420,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21,0
4,0,tcp,private,REJ,0,0,0,0,0,0,...,0.07,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21,1


In [70]:
np.shape(df) 

(125972, 44)

In [71]:
set(df['protocol_type']) 

{'icmp', 'tcp', 'udp'}

In [72]:
set(df['attack'])

{'back',
 'buffer_overflow',
 'ftp_write',
 'guess_passwd',
 'imap',
 'ipsweep',
 'land',
 'loadmodule',
 'multihop',
 'neptune',
 'nmap',
 'normal',
 'perl',
 'phf',
 'pod',
 'portsweep',
 'rootkit',
 'satan',
 'smurf',
 'spy',
 'teardrop',
 'warezclient',
 'warezmaster'}

In [73]:
set(df['service']) 


{'IRC',
 'X11',
 'Z39_50',
 'aol',
 'auth',
 'bgp',
 'courier',
 'csnet_ns',
 'ctf',
 'daytime',
 'discard',
 'domain',
 'domain_u',
 'echo',
 'eco_i',
 'ecr_i',
 'efs',
 'exec',
 'finger',
 'ftp',
 'ftp_data',
 'gopher',
 'harvest',
 'hostnames',
 'http',
 'http_2784',
 'http_443',
 'http_8001',
 'imap4',
 'iso_tsap',
 'klogin',
 'kshell',
 'ldap',
 'link',
 'login',
 'mtp',
 'name',
 'netbios_dgm',
 'netbios_ns',
 'netbios_ssn',
 'netstat',
 'nnsp',
 'nntp',
 'ntp_u',
 'other',
 'pm_dump',
 'pop_2',
 'pop_3',
 'printer',
 'private',
 'red_i',
 'remote_job',
 'rje',
 'shell',
 'smtp',
 'sql_net',
 'ssh',
 'sunrpc',
 'supdup',
 'systat',
 'telnet',
 'tftp_u',
 'tim_i',
 'time',
 'urh_i',
 'urp_i',
 'uucp',
 'uucp_path',
 'vmnet',
 'whois'}

In [74]:
dos_attacks = ['apache2','back','land','neptune','mailbomb','pod','processtable','smurf','teardrop','udpstorm','worm']
probe_attacks = ['ipsweep','mscan','nmap','portsweep','saint','satan']
privilege_attacks = ['buffer_overflow','loadmdoule','perl','ps','rootkit','sqlattack','xterm']
access_attacks = ['ftp_write','guess_passwd','http_tunnel','imap','multihop','named','phf','sendmail','snmpgetattack','snmpguess','spy','warezclient','warezmaster','xclock','xsnoop']

attack_labels = ['Normal','DoS','Probe','Privilege','Access']  

def attack_types(attack):
    if attack in dos_attacks:
        attack_type = 1
    elif attack in probe_attacks:
        attack_type = 2
    elif attack in privilege_attacks:
        attack_type = 3
    elif attack in access_attacks:
        attack_type = 4
    else:
        attack_type = 0
        
    return attack_type

attack_map = df.attack.apply(attack_types)  

df['attack_map'] = attack_map

test_attack_map = test_df.attack.apply(attack_types)
test_df['attack_map'] = test_attack_map

df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,level,attack_flag,attack_map
0,0,udp,other,SF,146,0,0,0,0,0,...,0.88,0.0,0.0,0.0,0.0,0.0,normal,15,0,0
1,0,tcp,private,S0,0,0,0,0,0,0,...,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19,1,1
2,0,tcp,http,SF,232,8153,0,0,0,0,...,0.03,0.04,0.03,0.01,0.0,0.01,normal,21,0,0
3,0,tcp,http,SF,199,420,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,normal,21,0,0
4,0,tcp,private,REJ,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21,1,1


In [75]:
features_to_encode = ['protocol_type', 'service', 'flag']
encoded = pd.get_dummies(df[features_to_encode]) 
test_encoded_base = pd.get_dummies(test_df[features_to_encode])

test_index = np.arange(len(test_df.index))
column_diffs = list(set(encoded.columns.values)-set(test_encoded_base.columns.values)) 

diff_df = pd.DataFrame(0, index=test_index, columns=column_diffs) 

column_order = encoded.columns.to_list()

test_encoded_temp = test_encoded_base.join(diff_df)

test_final = test_encoded_temp[column_order].fillna(0)

numeric_features = ['duration', 'src_bytes', 'dst_bytes']

to_fit = encoded.join(df[numeric_features])
test_set = test_final.join(test_df[numeric_features])



In [76]:
binary_y = df['attack_flag'] 
multi_y = df['attack_map']

test_binary_y = test_df['attack_flag'] #normal mi saldırı mı
test_multi_y = test_df['attack_map'] #saldırı tipi ne 

binary_train_X, binary_val_X, binary_train_y, binary_val_y = train_test_split(to_fit, binary_y, test_size=0.2)
multi_train_X, multi_val_X, multi_train_y, multi_val_y = train_test_split(to_fit, multi_y, test_size = 0.2)


In [77]:
binary_train_X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100777 entries, 5462 to 97790
Data columns (total 87 columns):
 #   Column               Non-Null Count   Dtype
---  ------               --------------   -----
 0   protocol_type_icmp   100777 non-null  bool 
 1   protocol_type_tcp    100777 non-null  bool 
 2   protocol_type_udp    100777 non-null  bool 
 3   service_IRC          100777 non-null  bool 
 4   service_X11          100777 non-null  bool 
 5   service_Z39_50       100777 non-null  bool 
 6   service_aol          100777 non-null  bool 
 7   service_auth         100777 non-null  bool 
 8   service_bgp          100777 non-null  bool 
 9   service_courier      100777 non-null  bool 
 10  service_csnet_ns     100777 non-null  bool 
 11  service_ctf          100777 non-null  bool 
 12  service_daytime      100777 non-null  bool 
 13  service_discard      100777 non-null  bool 
 14  service_domain       100777 non-null  bool 
 15  service_domain_u     100777 non-null  bool 
 16  servi

In [78]:
binary_train_X.sample(5) 

Unnamed: 0,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_aol,service_auth,service_bgp,service_courier,...,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,duration,src_bytes,dst_bytes
115147,False,True,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,0,0,0
75176,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,0,105,145
91137,False,True,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,0,0,0
34780,False,True,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,0,0,0
32127,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,0,323,10840


In [79]:
from sklearn.ensemble import RandomForestClassifier


binary_model = RandomForestClassifier()
binary_model.fit(binary_train_X, binary_train_y)
binary_predictions = binary_model.predict(binary_val_X)

base_rf_score = accuracy_score(binary_predictions,binary_val_y)
base_rf_score

0.9930144870013892