In [2]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from collections import Counter
import time

In [None]:
data_directory = 'filelocation'

In [44]:
all_files = [f for f in os.listdir(data_directory) if f.endswith('.csv')]

df_list = []
for file in all_files:
    df1 = pd.read_csv(os.path.join(data_directory, file), delimiter='|', low_memory=False)
    df_list.append(df1)

df = pd.concat(df_list, ignore_index=True)


In [31]:
df = df.drop(['ts', 'uid', 'service', 'duration', 'orig_bytes', 'local_orig', 'local_resp', 'tunnel_parents', 'detailed-label' , 'history', 'resp_bytes'], axis=1)


In [32]:
df = df.drop(["orig_pkts", 'missed_bytes', 'resp_pkts', 'resp_ip_bytes'], axis = 1)

In [39]:
df.columns

Index(['id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p', 'proto',
       'conn_state', 'orig_ip_bytes', 'label',
       'Time,Source,Destination,Protocol,Length,Source Port,Destination Port,bad_packet'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25011003 entries, 0 to 25011002
Data columns (total 8 columns):
 #   Column         Dtype  
---  ------         -----  
 0   id.orig_h      object 
 1   id.orig_p      float64
 2   id.resp_h      object 
 3   id.resp_p      float64
 4   proto          object 
 5   conn_state     object 
 6   orig_ip_bytes  float64
 7   label          object 
dtypes: float64(3), object(5)
memory usage: 1.5+ GB


In [7]:
df.isna().sum()

id.orig_h        0
id.orig_p        0
id.resp_h        0
id.resp_p        0
proto            0
conn_state       0
orig_ip_bytes    0
label            0
dtype: int64

In [8]:
df.describe()

Unnamed: 0,id.orig_p,id.resp_p,orig_ip_bytes
count,25011000.0,25011000.0,25011000.0
mean,39909.62,9710.506,590.0188
std,16091.97,22174.48,687810.3
min,0.0,0.0,0.0
25%,32980.0,23.0,40.0
50%,42993.0,23.0,60.0
75%,52360.0,81.0,180.0
max,65535.0,65535.0,1914793000.0


In [9]:
df

Unnamed: 0,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,conn_state,orig_ip_bytes,label
0,192.168.100.103,51524.0,65.127.233.163,23.0,tcp,S0,180.0,Malicious
1,192.168.100.103,56305.0,63.150.16.171,23.0,tcp,S0,60.0,Malicious
2,192.168.100.103,41101.0,111.40.23.49,23.0,tcp,S0,60.0,Malicious
3,192.168.100.103,60905.0,131.174.215.147,23.0,tcp,S0,180.0,Malicious
4,192.168.100.103,44301.0,91.42.47.63,23.0,tcp,S0,60.0,Malicious
...,...,...,...,...,...,...,...,...
25010998,192.168.100.111,28057.0,173.94.58.125,23.0,tcp,S0,40.0,Malicious
25010999,192.168.100.111,52876.0,69.196.96.231,23.0,tcp,S1,360.0,Benign
25011000,192.168.100.111,39234.0,192.121.45.63,23.0,tcp,S1,220.0,Benign
25011001,168.102.14.4,11.0,192.168.100.111,0.0,icmp,OTH,68.0,Benign


In [10]:
df['proto'] = df['proto'].astype('category')
df['conn_state'] = df['conn_state'].astype('category')
df['label'] = df['label'].astype('category')


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25011003 entries, 0 to 25011002
Data columns (total 8 columns):
 #   Column         Dtype   
---  ------         -----   
 0   id.orig_h      object  
 1   id.orig_p      float64 
 2   id.resp_h      object  
 3   id.resp_p      float64 
 4   proto          category
 5   conn_state     category
 6   orig_ip_bytes  float64 
 7   label          category
dtypes: category(3), float64(3), object(2)
memory usage: 1.0+ GB


In [12]:
df.count()

id.orig_h        25011003
id.orig_p        25011003
id.resp_h        25011003
id.resp_p        25011003
proto            25011003
conn_state       25011003
orig_ip_bytes    25011003
label            25011003
dtype: int64

In [13]:
Q1 = df['orig_ip_bytes'].quantile(0.25)
Q3 = df['orig_ip_bytes'].quantile(0.75)
IQR = Q3 - Q1
df = df[(df['orig_ip_bytes'] >= (Q1 - 1.5 * IQR)) & (df['orig_ip_bytes'] <= (Q3 + 1.5 * IQR))]


In [14]:
df.count()

id.orig_h        24917851
id.orig_p        24917851
id.resp_h        24917851
id.resp_p        24917851
proto            24917851
conn_state       24917851
orig_ip_bytes    24917851
label            24917851
dtype: int64

In [15]:
print(df['label'].value_counts())


label
Benign                                   8769642
Malicious                                7044207
Malicious   DDoS                         5708219
Malicious   PartOfAHorizontalPortScan    3386230
Malicious   C&C                             6995
Malicious   Attack                          2558
Malicious   FileDownload                       0
Name: count, dtype: int64


In [16]:
# Strip whitespace from labels
df['label'] = df['label'].str.strip()

# Optional: Grouping labels if needed
label_mapping = {
    'Malicious': 'Malicious',
    'Benign': 'Benign',
    'Malicious   C&C': 'Malicious',
    'Malicious   PartOfAHorizontalPortScan': 'Malicious',
    'Malicious   DDoS': 'Malicious',
    'Malicious   Attack': 'Malicious',
    'Malicious   FileDownload': 'Malicious'
}

# Map the labels
df['label'] = df['label'].map(label_mapping)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['label'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['label'].map(label_mapping)


In [17]:
print(df['label'].value_counts())


label
Malicious    16148209
Benign        8769642
Name: count, dtype: int64


In [None]:
df['packet_size'] = df['orig_ip_bytes'] / 1024  # Convert bytes to kilobytes


In [19]:
df['label'].unique()

array(['Malicious', 'Benign'], dtype=object)

In [20]:
df

Unnamed: 0,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,conn_state,orig_ip_bytes,label,packet_size
0,192.168.100.103,51524.0,65.127.233.163,23.0,tcp,S0,180.0,Malicious,0.175781
1,192.168.100.103,56305.0,63.150.16.171,23.0,tcp,S0,60.0,Malicious,0.058594
2,192.168.100.103,41101.0,111.40.23.49,23.0,tcp,S0,60.0,Malicious,0.058594
3,192.168.100.103,60905.0,131.174.215.147,23.0,tcp,S0,180.0,Malicious,0.175781
4,192.168.100.103,44301.0,91.42.47.63,23.0,tcp,S0,60.0,Malicious,0.058594
...,...,...,...,...,...,...,...,...,...
25010998,192.168.100.111,28057.0,173.94.58.125,23.0,tcp,S0,40.0,Malicious,0.039062
25010999,192.168.100.111,52876.0,69.196.96.231,23.0,tcp,S1,360.0,Benign,0.351562
25011000,192.168.100.111,39234.0,192.121.45.63,23.0,tcp,S1,220.0,Benign,0.214844
25011001,168.102.14.4,11.0,192.168.100.111,0.0,icmp,OTH,68.0,Benign,0.066406


In [21]:
df.describe()

Unnamed: 0,id.orig_p,id.resp_p,orig_ip_bytes,packet_size
count,24917850.0,24917850.0,24917850.0,24917850.0
mean,39928.96,9734.842,97.5737,0.09528682
std,16082.18,22202.03,94.3514,0.09214003
min,0.0,0.0,0.0,0.0
25%,33013.0,23.0,40.0,0.0390625
50%,43012.0,23.0,60.0,0.05859375
75%,52367.0,81.0,180.0,0.1757812
max,65535.0,65535.0,390.0,0.3808594


In [22]:
scaler = MinMaxScaler()
df['orig_ip_bytes'] = scaler.fit_transform(df[['orig_ip_bytes']])

df = pd.get_dummies(df, columns=['proto'], prefix='proto', drop_first=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['orig_ip_bytes'] = scaler.fit_transform(df[['orig_ip_bytes']])


In [23]:
df

Unnamed: 0,id.orig_h,id.orig_p,id.resp_h,id.resp_p,conn_state,orig_ip_bytes,label,packet_size,proto_icmp,proto_tcp,proto_udp
0,192.168.100.103,51524.0,65.127.233.163,23.0,S0,0.461538,Malicious,0.175781,False,True,False
1,192.168.100.103,56305.0,63.150.16.171,23.0,S0,0.153846,Malicious,0.058594,False,True,False
2,192.168.100.103,41101.0,111.40.23.49,23.0,S0,0.153846,Malicious,0.058594,False,True,False
3,192.168.100.103,60905.0,131.174.215.147,23.0,S0,0.461538,Malicious,0.175781,False,True,False
4,192.168.100.103,44301.0,91.42.47.63,23.0,S0,0.153846,Malicious,0.058594,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...
25010998,192.168.100.111,28057.0,173.94.58.125,23.0,S0,0.102564,Malicious,0.039062,False,True,False
25010999,192.168.100.111,52876.0,69.196.96.231,23.0,S1,0.923077,Benign,0.351562,False,True,False
25011000,192.168.100.111,39234.0,192.121.45.63,23.0,S1,0.564103,Benign,0.214844,False,True,False
25011001,168.102.14.4,11.0,192.168.100.111,0.0,OTH,0.174359,Benign,0.066406,True,False,False


In [24]:
df.describe()

Unnamed: 0,id.orig_p,id.resp_p,orig_ip_bytes,packet_size
count,24917850.0,24917850.0,24917850.0,24917850.0
mean,39928.96,9734.842,0.250189,0.09528682
std,16082.18,22202.03,0.2419267,0.09214003
min,0.0,0.0,0.0,0.0
25%,33013.0,23.0,0.1025641,0.0390625
50%,43012.0,23.0,0.1538462,0.05859375
75%,52367.0,81.0,0.4615385,0.1757812
max,65535.0,65535.0,1.0,0.3808594


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24917851 entries, 0 to 25011002
Data columns (total 11 columns):
 #   Column         Dtype   
---  ------         -----   
 0   id.orig_h      object  
 1   id.orig_p      float64 
 2   id.resp_h      object  
 3   id.resp_p      float64 
 4   conn_state     category
 5   orig_ip_bytes  float64 
 6   label          object  
 7   packet_size    float64 
 8   proto_icmp     bool    
 9   proto_tcp      bool    
 10  proto_udp      bool    
dtypes: bool(3), category(1), float64(4), object(3)
memory usage: 1.6+ GB


In [26]:
df.to_csv('processed_data.csv', index=False)  # Save without row numbers (index)


In [3]:
df = pd.read_csv( 'C:/Users/calvi/projects/cnsproject/processed_data.csv')

In [9]:

benign_data = df[df['label'] == 'Benign']
malicious_data = df[df['label'] == 'Malicious']


malicious_half_sample = malicious_data.sample(frac=0.6, random_state=42)


sampled_df = pd.concat([benign_data, malicious_half_sample])


sampled_df = sampled_df.sample(frac=1, random_state=42).reset_index(drop=True)


print(f"Sampled DataFrame shape: {sampled_df.shape}")
print(sampled_df.head())

sampled_df.to_csv('sampled_dataset.csv', index=False)


Sampled DataFrame shape: (18458567, 11)
         id.orig_h  id.orig_p        id.resp_h  id.resp_p conn_state  \
0    192.168.1.196    58408.0  222.156.149.176       23.0         S0   
1  192.168.100.103    43763.0  124.237.246.139    42675.0         S0   
2    192.168.1.196    35912.0    86.96.167.160       23.0         S0   
3    192.168.1.196    40726.0  141.108.157.129       23.0         S0   
4    192.168.1.196    40822.0   187.72.138.138       23.0         S0   

   orig_ip_bytes   label  packet_size  proto_icmp  proto_tcp  proto_udp  
0       0.153846  Benign     0.058594       False       True      False  
1       0.102564  Benign     0.039062       False      False       True  
2       0.461538  Benign     0.175781       False       True      False  
3       0.153846  Benign     0.058594       False       True      False  
4       0.461538  Benign     0.175781       False       True      False  


In [10]:
sampled_df['label'].value_counts()

label
Malicious    9688925
Benign       8769642
Name: count, dtype: int64