# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import socket
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
import os

In [24]:
directory_in_str = r"/home/sd-work/Code/Minor-Project/Data/2021/"
directory = os.fsencode(directory_in_str)

tmp = pd.read_csv(r"/home/sd-work/Code/Minor-Project/2022.06.14.csv")

df =  tmp.truncate( before = len(tmp) )

print(df)
    

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".csv"): 
         
        tmp = pd.read_csv(directory_in_str + filename)

        tmp = tmp.drop( tmp[ tmp["label"] == "outlier"].index)

        df = pd.concat( [df, tmp] )

        continue
    else:
        continue

print(len(df))

Empty DataFrame
Columns: [avg_ipt, bytes_in, bytes_out, dest_ip, dest_port, entropy, num_pkts_out, num_pkts_in, proto, src_ip, src_port, time_end, time_start, total_entropy, label, duration]
Index: []
34037856


In [25]:
df.to_csv("/home/sd-work/Code/Minor-Project/2021.csv", index = False)

# Import dataset

In [2]:
df = pd.read_csv(r"/home/sd-work/Code/Minor-Project/2021.csv")

X = df.drop(columns = "label", axis=1)
y = pd.DataFrame(df["label"])

irrelevant_labels = ["dest_ip", "src_ip", "time_end", "time_start"]

X = X.drop( columns = irrelevant_labels, axis = 1 )


In [3]:
print( y.value_counts())

label    
benign       24126259
malicious     9911597
dtype: int64


# Sampling the datset

In [4]:
rus = RandomUnderSampler(random_state=0)

X, y = rus.fit_resample(X, y)

In [5]:
print( y.value_counts())

label    
benign       9911597
malicious    9911597
dtype: int64


## Imputing missing values

In [6]:

sim_imputer = SimpleImputer(strategy="most_frequent")

X = pd.DataFrame( sim_imputer.fit_transform(X), columns = X.columns)
y = pd.DataFrame( sim_imputer.fit_transform(y), columns = y.columns)


In [7]:
print(len(X.iloc[ : , 0]))
print(X.isna().sum())

19823194
avg_ipt          0
bytes_in         0
bytes_out        0
dest_port        0
entropy          0
num_pkts_out     0
num_pkts_in      0
proto            0
src_port         0
total_entropy    0
duration         0
dtype: int64


In [9]:
X["proto"] = X["proto"].astype(int)
print(X["proto"].head())

0    6
1    6
2    6
3    6
4    6
Name: proto, dtype: int64


# Replacing protocol no with name

In [11]:
prefix = "IPPROTO_"

table = {num:name[len(prefix):] 
          for name,num in vars(socket).items()
            if name.startswith(prefix)}

print(table)



for i in range(len(X.iloc[ : , 7 ])):
    X.iloc[ i , 7] = (table[X.iloc[ i , 7]]).lower()


{0: 'HOPOPTS', 1: 'ICMP', 2: 'IGMP', 41: 'IPV6', 4: 'IPIP', 6: 'TCP', 8: 'EGP', 12: 'PUP', 17: 'UDP', 136: 'UDPLITE', 22: 'IDP', 29: 'TP', 43: 'ROUTING', 44: 'FRAGMENT', 46: 'RSVP', 47: 'GRE', 50: 'ESP', 51: 'AH', 58: 'ICMPV6', 59: 'NONE', 60: 'DSTOPTS', 103: 'PIM', 132: 'SCTP', 262: 'MPTCP', 255: 'RAW'}


In [22]:
print(X.iloc[:10 , :])


   Unnamed: 0  avg_ipt  bytes_in  bytes_out  dest_port   entropy  \
0    939136.0      0.0       0.0        0.0    33122.0  0.000000   
1    462582.0      0.0       0.0    13784.0     9200.0  1.934536   
2   1475874.0      0.0       0.0      330.0    39290.0  6.844364   
3    297891.0      0.0       0.0     2896.0     9200.0  5.027503   
4    631640.0      0.0       0.0     1412.0     9200.0  5.296583   
5    910737.0      0.0       0.0     5334.0     9200.0  3.950911   
6    270184.0      0.0       0.0        0.0    45506.0  0.000000   
7    810926.0     10.5     354.0      502.0     9200.0  6.457975   
8    362616.0      0.0       0.0     7240.0     9200.0  3.625452   
9     53947.0      0.0       0.0    14480.0     9200.0  2.042063   

   num_pkts_out num_pkts_in proto  src_port  total_entropy  duration  
0           1.0     hopopts     6    9200.0         0.0000  0.000000  
1          10.0     hopopts     6   44798.0     26665.6480  0.000133  
2           1.0     hopopts     6    9

# One hot encoding categorical variables

In [12]:
X = pd.concat((X,pd.get_dummies(X.proto)),1)

oth = [ 0 for i in range(X.shape[0]) ]

others = pd.DataFrame( oth , columns= ["other_proto"])

X = pd.concat((X, others), axis=1 )

print(X.head)

  X = pd.concat((X,pd.get_dummies(X.proto)),1)


<bound method NDFrame.head of              avg_ipt  bytes_in  bytes_out  dest_port   entropy  num_pkts_out  \
0           0.000000       0.0        0.0    33122.0  0.000000           1.0   
1           0.000000       0.0    13784.0     9200.0  1.934536          10.0   
2           0.000000       0.0      330.0    39290.0  6.844364           1.0   
3           0.000000       0.0     2896.0     9200.0  5.027503           2.0   
4           0.000000       0.0     1412.0     9200.0  5.296583           1.0   
...              ...       ...        ...        ...       ...           ...   
19823189  148.000000     270.0      191.0      445.0  4.574347           6.0   
19823190   69.750000     270.0      191.0      445.0  4.591700           6.0   
19823191   10.928571    1447.0     1276.0       22.0  6.877138          13.0   
19823192   71.500000     270.0      191.0      445.0  4.591700           6.0   
19823193   71.750000     270.0      191.0      445.0  4.591700           6.0   

         

# Label Encoding

In [13]:
le = LabelEncoder()


y = le.fit_transform(y)

y = pd.DataFrame( data = y, columns = ["label"] )

import pickle

with open('enc.pickle', 'wb') as file:
    pickle.dump(le, file, pickle.HIGHEST_PROTOCOL)

  y = column_or_1d(y, warn=True)


In [14]:
X = X.drop( columns = ["proto"],axis = 1 )


# Split the dataset

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

# Standardization

In [16]:
mean = X_train.iloc[ : , : 10].mean()
std = X_train.iloc[ : , : 10].std()

X_train.iloc[ : , : 10] = ( X_train.iloc[ : , : 10] - mean ) / (std)
X_test.iloc[ : , : 10] = ( X_test.iloc[ : , : 10] - mean ) / (std)

In [20]:
print(X_train.describe())

            avg_ipt      bytes_in     bytes_out     dest_port       entropy  \
count  1.585856e+07  1.585856e+07  1.585856e+07  1.585856e+07  1.585856e+07   
mean  -3.066097e-17  1.050364e-17 -3.760933e-18 -4.335983e-17 -1.182615e-16   
std    1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00   
min   -3.009640e-02 -1.989271e-01 -4.862600e-01 -7.567246e-01 -1.299397e+00   
25%   -3.009640e-02 -1.989271e-01 -4.862600e-01 -7.315256e-01 -1.299397e+00   
50%   -3.009640e-02 -1.989271e-01 -4.549175e-01 -2.346416e-01  2.123823e-01   
75%   -3.009615e-02 -1.208331e-01 -1.103612e-02 -2.346416e-01  6.709507e-01   
max    9.460455e+01  2.807410e+01  1.026781e+01  2.962613e+00  5.492875e+01   

       num_pkts_out   num_pkts_in      src_port  total_entropy      duration  \
count  1.585856e+07  1.585856e+07  1.585856e+07   1.585856e+07  1.585856e+07   
mean  -1.648824e-19 -2.712809e-17  3.069143e-18   3.165510e-16 -5.946857e-18   
std    1.000000e+00  1.000000e+00  1.000000e+00 

In [24]:
print(std.astype(float))

avg_ipt          4.538472e+07
bytes_in         2.317722e+03
bytes_out        6.093970e+03
dest_port        1.761980e+04
entropy          2.380083e+00
num_pkts_out     1.591114e+01
num_pkts_in      1.132407e+01
src_port         2.000090e+04
total_entropy    4.789522e+04
duration         6.205439e+00
dtype: float64


In [26]:
std.to_json(r"/home/sd-work/Code/Minor-Project/std.json")
mean.to_json(r"/home/sd-work/Code/Minor-Project/mean.json")

In [28]:
X_train.to_csv(r"/home/sd-work/Code/Minor-Project/Preprocessed/X_train.csv", index = False)
X_test.to_csv(r"/home/sd-work/Code/Minor-Project/Preprocessed/X_test.csv", index = False)
y_train.to_csv(r"/home/sd-work/Code/Minor-Project/Preprocessed/y_train.csv", index = False)
y_test.to_csv(r"/home/sd-work/Code/Minor-Project/Preprocessed/y_test.csv", index = False )