### Importing the data
We import the data and mark their labels

In [1]:
import pandas as pd

df = pd.read_csv('data/UNSW-NB15_4.csv')

df.columns = ['srcip',    'sport',    'dstip',    'dsport',    'proto',    'state',    'dur',    'sbytes',    'dbytes',    'sttl',    'dttl',    'sloss',    'dloss',    'service',    'sload',    'dload',    'spkts',    'dpkts',    'swin',    'dwin',    'stcpb',    'dtcpb',    'smeansz',    'dmeansz',    'trans_depth',    'res_bdy_len',    'sjit',
                        'djit',    'Stime',    'Ltime',    'Sintpkt',    'Dintpkt',    'tcprtt',    'synack',    'ackdat',    'is_sm_ips_ports',    'ct_state_ttl',    'ct_flw_http_mthd',    'is_ftp_login',    'ct_ftp_cmd',    'ct_srv_src',    'ct_srv_dst',    'ct_dst_ltm',    'ct_src_ltm',    'ct_src_dport_ltm',    'ct_dst_sport_ltm',    'ct_dst_src_ltm',    'attack_cat',    'label']
df.head()

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,label
0,59.166.0.9,9685,149.171.126.2,80,tcp,FIN,5.864748,19410,1087890,31,...,,3,1,4,4,1,1,1,,0
1,59.166.0.2,1421,149.171.126.4,53,udp,CON,0.001391,146,178,31,...,,3,5,2,7,1,1,4,,0
2,59.166.0.2,21553,149.171.126.2,25,tcp,FIN,0.053948,37812,3380,31,...,,1,1,4,7,1,1,3,,0
3,59.166.0.8,45212,149.171.126.4,53,udp,CON,0.000953,146,178,31,...,,2,5,2,1,1,1,2,,0
4,59.166.0.0,59922,149.171.126.8,6881,tcp,FIN,8.633186,25056,1094788,31,...,,9,7,2,3,2,1,6,,0


In [2]:
df.shape

(440043, 49)

We remove the labels since we are applying unsupervised learning. <br>
We also take only few columns for our sample. Note, that this is based on domain expertise and literature review.

In [3]:
sample = df.sample(20_000)

columns = ['srcip', 'sport', 'dstip', 'dsport', 'proto', 'dur', 'sbytes', 'dbytes',
            'sttl', 'dttl', 'sloss', 'dloss', 'sload', 'dload', 'spkts', 'dpkts',
            'swin', 'dwin', 'sjit', 'djit', 'is_sm_ips_ports', 'ct_srv_dst']
sample = sample[columns]

In [4]:
sample.dtypes

srcip               object
sport                int64
dstip               object
dsport              object
proto               object
dur                float64
sbytes               int64
dbytes               int64
sttl                 int64
dttl                 int64
sloss                int64
dloss                int64
sload              float64
dload              float64
spkts                int64
dpkts                int64
swin                 int64
dwin                 int64
sjit               float64
djit               float64
is_sm_ips_ports      int64
ct_srv_dst           int64
dtype: object

In [5]:
sample = sample.loc[(sample['dsport'].astype('str').str.isnumeric())]

In [6]:
sample['dsport'].astype('str').astype('int')

361468       80
70058     30504
47474         0
238674      111
426498    19911
          ...  
300213        0
157190       53
420180    53740
134304       53
377192       53
Name: dsport, Length: 19997, dtype: int32

In [7]:
nominal_cols = ["srcip", "dstip", "proto"]
sample = pd.get_dummies(sample, columns=nominal_cols)

In [8]:
sample.shape

(19997, 189)

In [9]:
# Creating the model
from sklearn.ensemble import IsolationForest

model = IsolationForest(n_estimators=100, contamination=.02, random_state=42)

# Fitting the model
model.fit(sample)

# Predicting the anomalies
predictions = model.predict(sample)

# Adding the predictions to the original sample
sample['label'] = predictions



In [10]:
sample.head()

Unnamed: 0,sport,dsport,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,sload,...,proto_vmtp,proto_vrrp,proto_wb-expak,proto_wb-mon,proto_wsn,proto_xnet,proto_xns-idp,proto_xtp,proto_zero,label
361468,28405,80,1.102088,1684,10168,31,29,3,5,11353.0,...,0,0,0,0,0,0,0,0,0,1
70058,29269,30504,0.039952,2958,32374,31,29,7,18,580096.1,...,0,0,0,0,0,0,0,0,0,1
47474,0,0,9e-06,200,0,254,0,0,0,88888890.0,...,0,0,0,0,0,0,0,0,0,1
238674,65184,111,2.713431,564,354,254,252,2,1,1497.735,...,0,0,0,0,0,0,0,0,0,1
426498,63251,19911,0.034929,3614,44706,31,29,7,24,813994.1,...,0,0,0,0,0,0,0,0,0,1


In [11]:
len(sample.loc[sample['label'] == -1]), len(sample.loc[sample['label'] == 1])

(400, 19597)