# Packet Cleaning
In this notebook, we will access the various network captures for processing and cleaning.

In [94]:
import pandas as pd
import glob
import warnings
# Suppress FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

## Clean Packet Capture

In [95]:
# List of all csv in a dir
csv_files = glob.glob('modbus_dataset/clean/*.csv')

clean_df = pd.DataFrame()

# append to combined dataframe
for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    capture = csv_file.split('\\')[-1].replace('.csv', '')
    df['CaptureName'] = capture
    clean_df = pd.concat([clean_df, df])

clean_df['AttackName'] = 'Clean'

In [96]:
# Convert EpochTime to datetime format and create 'date_time' column
clean_df['DateTime'] = pd.to_datetime(clean_df['EpochTime'], unit='s', utc=True)

In [97]:
# Check for missing values
missing_values = clean_df.isnull().sum()
print(missing_values)

No.                     0
Time                    0
SrcIP                   0
DstIP                   0
Protocol                0
Length                  0
Info                    0
SrcMAC                  0
SrcMACResolved          0
SrcOUIResolved          0
SrcPort             40771
SequenceNumber      46224
SrcOUI                  0
DstMAC                  0
DstMACResolved          0
DstOUI                  0
DstOUResolved       35895
DstPort             40771
DstMACResolved.1        0
SYNFlag             46224
ACKFlag             46224
ProtocolType            0
EpochTime               0
RelativeTime            0
TimeDelta               0
CaptureName             0
AttackName              0
DateTime                0
dtype: int64


In [98]:
# Count the number of packets where SrcPort is not applicable (null)
srcport_not_applicable = clean_df['SrcPort'].isnull().sum()
print(f"Number of packets where SrcPort is not applicable (null): {srcport_not_applicable}")

# Count the number of packets where DstPort is not applicable (null)
dstport_not_applicable = clean_df['DstPort'].isnull().sum()
print(f"Number of packets where DstPort is not applicable (null): {dstport_not_applicable}")

# Count the number of packets where SrcPort should be applicable but is null
srcport_missing = clean_df[(clean_df['Protocol'] == 'TCP') | (clean_df['Protocol'] == 'UDP')]['SrcPort'].isnull().sum()
print(f"Number of packets where SrcPort should be applicable but is null: {srcport_missing}")

# Count the number of packets where DstPort should be applicable but is null
dstport_missing = clean_df[(clean_df['Protocol'] == 'TCP') | (clean_df['Protocol'] == 'UDP')]['DstPort'].isnull().sum()
print(f"Number of packets where DstPort should be applicable but is null: {dstport_missing}")


Number of packets where SrcPort is not applicable (null): 40771
Number of packets where DstPort is not applicable (null): 40771
Number of packets where SrcPort should be applicable but is null: 0
Number of packets where DstPort should be applicable but is null: 0


In [99]:
# Count the number of packets where SequenceNumber is not applicable (null)
seqnum_not_applicable = clean_df['SequenceNumber'].isnull().sum()
print(f"Number of packets where SequenceNumber is not applicable (null): {seqnum_not_applicable}")

# Count the number of packets where SequenceNumber should be applicable but is null
seqnum_missing = clean_df[(clean_df['Protocol'] == 'TCP')]['SequenceNumber'].isnull().sum()
print(f"Number of packets where SequenceNumber should be applicable but is null: {seqnum_missing}")


Number of packets where SequenceNumber is not applicable (null): 46224
Number of packets where SequenceNumber should be applicable but is null: 0


In [100]:
# Count the number of packets where SYNFlag is not applicable (null)
synflag_not_applicable = clean_df['SYNFlag'].isnull().sum()
print(f"Number of packets where SYNFlag is not applicable (null): {synflag_not_applicable}")

# Count the number of packets where ACKFlag is not applicable (null)
ackflag_not_applicable = clean_df['ACKFlag'].isnull().sum()
print(f"Number of packets where ACKFlag is not applicable (null): {ackflag_not_applicable}")

# Count the number of packets where SYNFlag should be applicable but is null
synflag_missing = clean_df[(clean_df['Protocol'] == 'TCP')]['SYNFlag'].isnull().sum()
print(f"Number of packets where SYNFlag should be applicable but is null: {synflag_missing}")

# Count the number of packets where ACKFlag should be applicable but is null
ackflag_missing = clean_df[(clean_df['Protocol'] == 'TCP')]['ACKFlag'].isnull().sum()
print(f"Number of packets where ACKFlag should be applicable but is null: {ackflag_missing}")


Number of packets where SYNFlag is not applicable (null): 46224
Number of packets where ACKFlag is not applicable (null): 46224
Number of packets where SYNFlag should be applicable but is null: 0
Number of packets where ACKFlag should be applicable but is null: 0


In [101]:
# Count the number of packets where DstOUResolved is not applicable (null)
dstou_not_applicable = clean_df['DstOUResolved'].isnull().sum()
print(f"Number of packets where DstOUResolved is not applicable (null): {dstou_not_applicable}")

# Count the number of packets where DstOUResolved should be applicable but is null
dstou_missing = clean_df['DstOUResolved'].isnull().sum()  # No specific protocol condition since DstOUResolved can be applicable to various protocols
print(f"Number of packets where DstOUResolved should be applicable but is null: {dstou_missing}")


Number of packets where DstOUResolved is not applicable (null): 35895
Number of packets where DstOUResolved should be applicable but is null: 35895


In [102]:
# Count the number of packets where DstOUResolved is not applicable (null)
dstou_not_applicable = clean_df['DstOUResolved'].isnull().sum()
print(f"Number of packets where DstOUResolved is not applicable (null): {dstou_not_applicable}")

# Count the number of packets where DstOUResolved should be applicable but is null
dstou_missing = clean_df[((clean_df['Protocol'] == 'TCP') | (clean_df['Protocol'] == 'UDP')) & clean_df['DstOUResolved'].isnull()].shape[0]
print(f"Number of packets where DstOUResolved should be applicable but is null: {dstou_missing}")


Number of packets where DstOUResolved is not applicable (null): 35895
Number of packets where DstOUResolved should be applicable but is null: 1286


In [103]:

# 1. Handling missing values in SrcPort and DstPort
clean_df['SrcPort'].fillna(-1, inplace=True)  # Filling missing SrcPort with -1
clean_df['DstPort'].fillna(-1, inplace=True)  # Filling missing DstPort with -1

# 2. Handling missing values in SequenceNumber
# Assuming TCP packets have SequenceNumber and using median imputation
median_seqnum = clean_df[clean_df['Protocol'] == 'TCP']['SequenceNumber'].median()
clean_df['SequenceNumber'].fillna(median_seqnum, inplace=True)

# 3. Handling missing values in DstOUResolved
clean_df['DstOUResolved'].fillna('Unknown', inplace=True)  # Filling missing DstOUResolved with 'Unknown'

# 4. Handling missing values in SYNFlag and ACKFlag
clean_df['SYNFlag'].fillna(False, inplace=True)  # Filling missing SYNFlag with False
clean_df['ACKFlag'].fillna(False, inplace=True)  # Filling missing ACKFlag with False

### Clean packets after processing

In [104]:
clean_df

Unnamed: 0,No.,Time,SrcIP,DstIP,Protocol,Length,Info,SrcMAC,SrcMACResolved,SrcOUIResolved,...,DstMACResolved.1,SYNFlag,ACKFlag,ProtocolType,EpochTime,RelativeTime,TimeDelta,CaptureName,AttackName,DateTime
0,1,0.000000,172.27.224.70,172.27.224.250,TCP,60,49499 > 502 [ACK] Seq=1 Ack=1 Win=65051 Len=0,00:0c:29:9d:9e:9e,VMware_9d:9e:9e,"VMware, Inc.",...,Telemech_09:51:3b,Not set,Set,TCP,1.535046e+09,0.000000,0.000000,"eth2dump-clean-0,5h_1",Clean,2018-08-23 17:40:48.376131058+00:00
1,2,0.020940,HewlettP_8e:40:b3,Spanning-tree-(for-bridges)_00,STP,64,RST. Root = 32768/0/00:18:6e:d7:8a:c0 Cost = ...,d0:7e:28:8e:40:b3,HewlettP_8e:40:b3,Hewlett Packard,...,Spanning-tree-(for-bridges)_00,False,False,STP,1.535046e+09,0.020940,0.020940,"eth2dump-clean-0,5h_1",Clean,2018-08-23 17:40:48.397070885+00:00
2,3,0.094309,172.27.224.70,172.27.224.250,Modbus/TCP,66,"Query: Trans: 0; Unit: 1, Func: 3: ...",00:0c:29:9d:9e:9e,VMware_9d:9e:9e,"VMware, Inc.",...,Telemech_09:51:3b,Not set,Set,Modbus/TCP,1.535046e+09,0.094309,0.073369,"eth2dump-clean-0,5h_1",Clean,2018-08-23 17:40:48.470439911+00:00
3,4,0.097427,172.27.224.250,172.27.224.70,Modbus/TCP,85,"Response: Trans: 0; Unit: 1, Func: 3: ...",00:80:f4:09:51:3b,Telemech_09:51:3b,Telemechanique Electrique,...,VMware_9d:9e:9e,Not set,Set,Modbus/TCP,1.535046e+09,0.097427,0.003118,"eth2dump-clean-0,5h_1",Clean,2018-08-23 17:40:48.473557949+00:00
4,5,0.311972,172.27.224.70,172.27.224.250,TCP,60,49499 > 502 [ACK] Seq=13 Ack=32 Win=65020 Len=0,00:0c:29:9d:9e:9e,VMware_9d:9e:9e,"VMware, Inc.",...,Telemech_09:51:3b,Not set,Set,TCP,1.535046e+09,0.311972,0.214545,"eth2dump-clean-0,5h_1",Clean,2018-08-23 17:40:48.688102961+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427837,427838,21598.650849,172.27.224.250,172.27.224.70,Modbus/TCP,85,"Response: Trans: 0; Unit: 1, Func: 3: ...",00:80:f4:09:51:3b,Telemech_09:51:3b,Telemechanique Electrique,...,VMware_9d:9e:9e,Not set,Set,Modbus/TCP,1.536452e+09,21598.650849,0.010907,eth2dump-clean-6h_1,Clean,2018-09-09 00:14:03.629414082+00:00
427838,427839,21598.857675,172.27.224.70,172.27.224.250,TCP,60,49205 > 502 [ACK] Seq=817261 Ack=2111256 Win...,00:0c:29:9d:9e:9e,VMware_9d:9e:9e,"VMware, Inc.",...,Telemech_09:51:3b,Not set,Set,TCP,1.536452e+09,21598.857675,0.206826,eth2dump-clean-6h_1,Clean,2018-09-09 00:14:03.836240053+00:00
427839,427840,21598.952198,172.27.224.70,172.27.224.250,Modbus/TCP,66,"Query: Trans: 0; Unit: 1, Func: 3: ...",00:0c:29:9d:9e:9e,VMware_9d:9e:9e,"VMware, Inc.",...,Telemech_09:51:3b,Not set,Set,Modbus/TCP,1.536452e+09,21598.952198,0.094523,eth2dump-clean-6h_1,Clean,2018-09-09 00:14:03.930763006+00:00
427840,427841,21598.960188,172.27.224.250,172.27.224.70,Modbus/TCP,85,"Response: Trans: 0; Unit: 1, Func: 3: ...",00:80:f4:09:51:3b,Telemech_09:51:3b,Telemechanique Electrique,...,VMware_9d:9e:9e,Not set,Set,Modbus/TCP,1.536452e+09,21598.960188,0.007990,eth2dump-clean-6h_1,Clean,2018-09-09 00:14:03.938752890+00:00


In [105]:
# Check for missing values
missing_values = clean_df.isnull().sum()
print(missing_values)

No.                 0
Time                0
SrcIP               0
DstIP               0
Protocol            0
Length              0
Info                0
SrcMAC              0
SrcMACResolved      0
SrcOUIResolved      0
SrcPort             0
SequenceNumber      0
SrcOUI              0
DstMAC              0
DstMACResolved      0
DstOUI              0
DstOUResolved       0
DstPort             0
DstMACResolved.1    0
SYNFlag             0
ACKFlag             0
ProtocolType        0
EpochTime           0
RelativeTime        0
TimeDelta           0
CaptureName         0
AttackName          0
DateTime            0
dtype: int64


## Man in the Middle Attack

In [106]:
csv_files = glob.glob('modbus_dataset/mitm/*.csv')

mitm_df = pd.DataFrame()

for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    capture = csv_file.split('\\')[-1].replace('.csv', '')
    df['CaptureName'] = capture
    mitm_df = pd.concat([mitm_df, df])

mitm_df['AttackName'] = 'mitm'

In [107]:
# Convert EpochTime to datetime format and create 'date_time' column
mitm_df['DateTime'] = pd.to_datetime(mitm_df['EpochTime'], unit='s')

In [108]:
# Check for missing values
missing_values = mitm_df.isnull().sum()
print(missing_values)

No.                     0
Time                    0
SrcIP                   0
DstIP                   0
Protocol                0
Length                  0
Info                    0
SrcMAC                  0
SrcMACResolved          0
SrcOUIResolved          0
SrcPort             43786
SequenceNumber      47798
SrcOUI                  0
DstMAC                  0
DstMACResolved          0
DstOUI                  0
DstOUResolved       34177
DstPort             43786
DstMACResolved.1        0
SYNFlag             47798
ACKFlag             47798
ProtocolType            0
EpochTime               0
RelativeTime            0
TimeDelta               0
CaptureName             0
AttackName              0
DateTime                0
dtype: int64


In [109]:
# 1. Handling missing values in SrcPort and DstPort
mitm_df['SrcPort'].fillna(-1, inplace=True)  # Filling missing SrcPort with -1
mitm_df['DstPort'].fillna(-1, inplace=True)  # Filling missing DstPort with -1

# 2. Handling missing values in SequenceNumber
# Assuming TCP packets have SequenceNumber and using median imputation
median_seqnum = mitm_df[mitm_df['Protocol'] == 'TCP']['SequenceNumber'].median()
mitm_df['SequenceNumber'].fillna(median_seqnum, inplace=True)

# 3. Handling missing values in DstOUResolved
mitm_df['DstOUResolved'].fillna('Unknown', inplace=True)  # Filling missing DstOUResolved with 'Unknown'

# 4. Handling missing values in SYNFlag and ACKFlag
mitm_df['SYNFlag'].fillna(False, inplace=True)  # Filling missing SYNFlag with False
mitm_df['ACKFlag'].fillna(False, inplace=True)  # Filling missing ACKFlag with False

### Mitm after processing

In [110]:
mitm_df.head()

Unnamed: 0,No.,Time,SrcIP,DstIP,Protocol,Length,Info,SrcMAC,SrcMACResolved,SrcOUIResolved,...,DstMACResolved.1,SYNFlag,ACKFlag,ProtocolType,EpochTime,RelativeTime,TimeDelta,CaptureName,AttackName,DateTime
0,1,0.0,fe80::20c:29ff:fee6:1421,ff02::2,ICMPv6,62,Router Solicitation,00:0c:29:e6:14:21,VMware_e6:14:21,"VMware, Inc.",...,IPv6mcast_02,False,False,ICMPv6,1535070000.0,0.0,0.0,eth2dump-mitm-change-15m-1h_1,mitm,2018-08-24 00:16:22.370615005
1,2,0.014983,172.27.224.70,172.27.224.250,TCP,60,49499 > 502 [ACK] Seq=1 Ack=1 Win=65020 Len=0,00:0c:29:9d:9e:9e,VMware_9d:9e:9e,"VMware, Inc.",...,Telemech_09:51:3b,Not set,Set,TCP,1535070000.0,0.014983,0.014983,eth2dump-mitm-change-15m-1h_1,mitm,2018-08-24 00:16:22.385597944
2,3,0.109225,172.27.224.70,172.27.224.250,Modbus/TCP,66,"Query: Trans: 0; Unit: 1, Func: 3: ...",00:0c:29:9d:9e:9e,VMware_9d:9e:9e,"VMware, Inc.",...,Telemech_09:51:3b,Not set,Set,Modbus/TCP,1535070000.0,0.109225,0.094242,eth2dump-mitm-change-15m-1h_1,mitm,2018-08-24 00:16:22.479840040
3,4,0.120609,172.27.224.250,172.27.224.70,Modbus/TCP,85,"Response: Trans: 0; Unit: 1, Func: 3: ...",00:80:f4:09:51:3b,Telemech_09:51:3b,Telemechanique Electrique,...,VMware_9d:9e:9e,Not set,Set,Modbus/TCP,1535070000.0,0.120609,0.011384,eth2dump-mitm-change-15m-1h_1,mitm,2018-08-24 00:16:22.491224051
4,5,0.326988,172.27.224.70,172.27.224.250,TCP,60,49499 > 502 [ACK] Seq=13 Ack=32 Win=64989 Len=0,00:0c:29:9d:9e:9e,VMware_9d:9e:9e,"VMware, Inc.",...,Telemech_09:51:3b,Not set,Set,TCP,1535070000.0,0.326988,0.206379,eth2dump-mitm-change-15m-1h_1,mitm,2018-08-24 00:16:22.697602987


In [111]:
# Check for missing values
missing_values = mitm_df.isnull().sum()
print(missing_values)

No.                 0
Time                0
SrcIP               0
DstIP               0
Protocol            0
Length              0
Info                0
SrcMAC              0
SrcMACResolved      0
SrcOUIResolved      0
SrcPort             0
SequenceNumber      0
SrcOUI              0
DstMAC              0
DstMACResolved      0
DstOUI              0
DstOUResolved       0
DstPort             0
DstMACResolved.1    0
SYNFlag             0
ACKFlag             0
ProtocolType        0
EpochTime           0
RelativeTime        0
TimeDelta           0
CaptureName         0
AttackName          0
DateTime            0
dtype: int64


## modbusQuery2Flooding

In [112]:
csv_files = glob.glob('modbus_dataset/modbusQuery2Flooding/*.csv')

modbusQuery2Flooding_df = pd.DataFrame()

for csv_file in csv_files:
    try:
        df = pd.read_csv(csv_file)
        capture = csv_file.split('\\')[-1].replace('.csv', '')
        df['CaptureName'] = capture
        modbusQuery2Flooding_df = pd.concat([modbusQuery2Flooding_df, df])
    except Exception as e:
        print(f"Failed to read {csv_file}: {e}")

modbusQuery2Flooding_df['AttackName'] = 'modbusQuery2Flooding'

Failed to read modbus_dataset/modbusQuery2Flooding\eth2dump-modbusQuery2Flooding30m-1h_1.csv: 'utf-8' codec can't decode byte 0x86 in position 246153: invalid start byte
Failed to read modbus_dataset/modbusQuery2Flooding\eth2dump-modbusQuery2Flooding5m-1h_1.csv: Unable to allocate 33.1 MiB for an array with shape (16, 271113) and data type object


In [113]:
# Convert EpochTime to datetime format and create 'DateTime' column
modbusQuery2Flooding_df['DateTime'] = pd.to_datetime(modbusQuery2Flooding_df['EpochTime'], unit='s', utc=True)

In [114]:
# Check for missing values
missing_values = modbusQuery2Flooding_df.isnull().sum()
print(missing_values)

No.                     0
Time                    0
SrcIP                   0
DstIP                   0
Protocol                0
Length                  0
Info                    0
SrcMAC                  0
SrcMACResolved          0
SrcOUIResolved          0
SrcPort             65687
SequenceNumber      74819
SrcOUI                  0
DstMAC                  0
DstMACResolved          0
DstOUI                  0
DstOUResolved       56798
DstPort             65687
DstMACResolved.1        0
SYNFlag             74819
ACKFlag             74819
ProtocolType            0
EpochTime               0
RelativeTime            0
TimeDelta               0
CaptureName             0
AttackName              0
DateTime                0
dtype: int64


In [115]:
# 1. Handling missing values in SrcPort and DstPort
modbusQuery2Flooding_df['SrcPort'].fillna(-1, inplace=True)  # Filling missing SrcPort with -1
modbusQuery2Flooding_df['DstPort'].fillna(-1, inplace=True)  # Filling missing DstPort with -1

# 2. Handling missing values in SequenceNumber
# Assuming TCP packets have SequenceNumber and using median imputation
median_seqnum = modbusQuery2Flooding_df[modbusQuery2Flooding_df['Protocol'] == 'TCP']['SequenceNumber'].median()
modbusQuery2Flooding_df['SequenceNumber'].fillna(median_seqnum, inplace=True)

# 3. Handling missing values in DstOUResolved
modbusQuery2Flooding_df['DstOUResolved'].fillna('Unknown', inplace=True)  # Filling missing DstOUResolved with 'Unknown'

# 4. Handling missing values in SYNFlag and ACKFlag
modbusQuery2Flooding_df['SYNFlag'].fillna(False, inplace=True)  # Filling missing SYNFlag with False
modbusQuery2Flooding_df['ACKFlag'].fillna(False, inplace=True)  # Filling missing ACKFlag with False

MemoryError: Unable to allocate 32.5 MiB for an array with shape (4, 1066596) and data type object

### ModbusQuery2Flooding after processing

In [None]:
modbusQuery2Flooding_df.head()

Unnamed: 0,No.,Time,SrcIP,DstIP,Protocol,Length,Info,SrcMAC,SrcMACResolved,SrcOUIResolved,...,DstMACResolved.1,SYNFlag,ACKFlag,ProtocolType,EpochTime,RelativeTime,TimeDelta,CaptureName,AttackName,DateTime
0,1,0.0,ASUSTekC_64:40:79,Broadcast,ARP,60,Who has 172.27.224.250? Tell 172.27.224.251,48:5b:39:64:40:79,ASUSTekC_64:40:79,ASUSTek COMPUTER INC.,...,Broadcast,False,False,ARP,1529553000.0,0.0,0.0,eth2dump-modbusQuery2Flooding-30m-12h_1,modbusQuery2Flooding,2018-06-21 03:42:29.676654100+00:00
1,2,0.00239,Telemech_09:51:3b,ASUSTekC_64:40:79,ARP,64,172.27.224.250 is at 00:80:f4:09:51:3b,00:80:f4:09:51:3b,Telemech_09:51:3b,Telemechanique Electrique,...,ASUSTekC_64:40:79,False,False,ARP,1529553000.0,0.00239,0.00239,eth2dump-modbusQuery2Flooding-30m-12h_1,modbusQuery2Flooding,2018-06-21 03:42:29.679044008+00:00
2,3,0.002402,172.27.224.251,172.27.224.250,TCP,60,49444 > 502 [SYN] Seq=0 Win=2048 Len=0 MSS=1460,48:5b:39:64:40:79,ASUSTekC_64:40:79,ASUSTek COMPUTER INC.,...,Telemech_09:51:3b,Set,Not set,TCP,1529553000.0,0.002402,1.2e-05,eth2dump-modbusQuery2Flooding-30m-12h_1,modbusQuery2Flooding,2018-06-21 03:42:29.679055929+00:00
3,4,0.012608,172.27.224.250,172.27.224.251,TCP,60,"502 > 49444 [SYN, ACK] Seq=0 Ack=1 Win=8192 ...",00:80:f4:09:51:3b,Telemech_09:51:3b,Telemechanique Electrique,...,ASUSTekC_64:40:79,Set,Set,TCP,1529553000.0,0.012608,0.010206,eth2dump-modbusQuery2Flooding-30m-12h_1,modbusQuery2Flooding,2018-06-21 03:42:29.689261913+00:00
4,5,0.012618,172.27.224.251,172.27.224.250,TCP,60,49444 > 502 [ACK] Seq=1 Ack=1 Win=2048 Len=0,48:5b:39:64:40:79,ASUSTekC_64:40:79,ASUSTek COMPUTER INC.,...,Telemech_09:51:3b,Not set,Set,TCP,1529553000.0,0.012618,1e-05,eth2dump-modbusQuery2Flooding-30m-12h_1,modbusQuery2Flooding,2018-06-21 03:42:29.689271927+00:00


In [None]:
# Check for missing values
missing_values = modbusQuery2Flooding_df.isnull().sum()
print(missing_values)

No.                 0
Time                0
SrcIP               0
DstIP               0
Protocol            0
Length              0
Info                0
SrcMAC              0
SrcMACResolved      0
SrcOUIResolved      0
SrcPort             0
SequenceNumber      0
SrcOUI              0
DstMAC              0
DstMACResolved      0
DstOUI              0
DstOUResolved       0
DstPort             0
DstMACResolved.1    0
SYNFlag             0
ACKFlag             0
ProtocolType        0
EpochTime           0
RelativeTime        0
TimeDelta           0
CaptureName         0
AttackName          0
DateTime            0
dtype: int64


## modbusQueryFlooding

In [None]:
csv_files = glob.glob('modbus_dataset/modbusQueryFlooding/*.csv')

modbusQueryFlooding_df = pd.DataFrame()

for csv_file in csv_files:
    try:
        df = pd.read_csv(csv_file)
        capture = csv_file.split('/')[-1].replace('.csv', '')
        df['CaptureName'] = capture
        modbusQueryFlooding_df = pd.concat([modbusQueryFlooding_df, df])
    except Exception as e:
        print(f"Failed to read {csv_file}: {e}")

modbusQueryFlooding_df['AttackName'] = 'modbusQueryFlooding'

In [None]:
# Convert EpochTime to datetime format and create 'DateTime' column
modbusQueryFlooding_df['DateTime'] = pd.to_datetime(modbusQueryFlooding_df['EpochTime'], unit='s', utc=True)

In [None]:
# Check for missing values
missing_values = modbusQueryFlooding_df.isnull().sum()
print(missing_values)

No.                     0
Time                    0
SrcIP                   0
DstIP                   0
Protocol                0
Length                  0
Info                    0
SrcMAC                  0
SrcMACResolved          0
SrcOUIResolved          0
SrcPort             39195
SequenceNumber      40129
SrcOUI                  0
DstMAC                  0
DstMACResolved          0
DstOUI                  0
DstOUResolved       12207
DstPort             39195
DstMACResolved.1        0
SYNFlag             40129
ACKFlag             40129
ProtocolType            0
EpochTime               0
TimeDelta           40129
RelativeTime            0
CaptureName             0
AttackName              0
DateTime                0
dtype: int64


In [None]:
# Count the number of packets where TimeDelta should be present but is null
timedelta_missing = modbusQueryFlooding_df[modbusQueryFlooding_df['TimeDelta'].isnull() & ((modbusQueryFlooding_df['Time'].notnull()) & (modbusQueryFlooding_df['EpochTime'].notnull()))].shape[0]
print(f"Number of packets where TimeDelta should be present but is null: {timedelta_missing}")


Number of packets where TimeDelta should be present but is null: 40129


In [None]:
# Sort the DataFrame by EpochTime to ensure the data is in chronological order
modbusQueryFlooding_df.sort_values(by='EpochTime', inplace=True)

# Interpolate missing TimeDelta values based on the EpochTime column
modbusQueryFlooding_df['TimeDelta'] = modbusQueryFlooding_df['EpochTime'].diff().fillna(method='backfill')

# Check if there are any remaining null values in TimeDelta
null_timedelta_count = modbusQueryFlooding_df['TimeDelta'].isnull().sum()
print(f"Number of remaining null values in TimeDelta after interpolation: {null_timedelta_count}")

Number of remaining null values in TimeDelta after interpolation: 0


In [None]:
# 1. Handling missing values in SrcPort and DstPort
modbusQueryFlooding_df['SrcPort'].fillna(-1, inplace=True)  # Filling missing SrcPort with -1
modbusQueryFlooding_df['DstPort'].fillna(-1, inplace=True)  # Filling missing DstPort with -1

# 2. Handling missing values in SequenceNumber
# Assuming TCP packets have SequenceNumber and using median imputation
median_seqnum = modbusQueryFlooding_df[modbusQueryFlooding_df['Protocol'] == 'TCP']['SequenceNumber'].median()
modbusQueryFlooding_df['SequenceNumber'].fillna(median_seqnum, inplace=True)

# 3. Handling missing values in DstOUResolved
modbusQueryFlooding_df['DstOUResolved'].fillna('Unknown', inplace=True)  # Filling missing DstOUResolved with 'Unknown'

# 4. Handling missing values in SYNFlag and ACKFlag
modbusQueryFlooding_df['SYNFlag'].fillna(False, inplace=True)  # Filling missing SYNFlag with False
modbusQueryFlooding_df['ACKFlag'].fillna(False, inplace=True)  # Filling missing ACKFlag with False

### ModbusQueryFlooding after processing

In [None]:
modbusQueryFlooding_df.head()

Unnamed: 0,No.,Time,SrcIP,DstIP,Protocol,Length,Info,SrcMAC,SrcMACResolved,SrcOUIResolved,...,DstMACResolved.1,SYNFlag,ACKFlag,ProtocolType,EpochTime,TimeDelta,RelativeTime,CaptureName,AttackName,DateTime
0,1,0.0,172.27.224.70,172.27.224.250,Modbus/TCP,66,"Query: Trans: 0; Unit: 1, Func: 3: ...",VMware_9d:9e:9e,VMware_9d:9e:9e,"VMware, Inc.",...,Telemecaniqu_09:51:3b,Not set,Set,Modbus/TCP,1527075000.0,0.010388,0.0,modbusQueryFlooding\eth2dump-modbusQueryFloodi...,modbusQueryFlooding,2018-05-23 11:26:39.256021023+00:00
1,2,0.010388,172.27.224.250,172.27.224.70,Modbus/TCP,85,"Response: Trans: 0; Unit: 1, Func: 3: ...",Telemecaniqu_09:51:3b,Telemecaniqu_09:51:3b,Telemecanique Electrique,...,VMware_9d:9e:9e,Not set,Set,Modbus/TCP,1527075000.0,0.010388,0.010388,modbusQueryFlooding\eth2dump-modbusQueryFloodi...,modbusQueryFlooding,2018-05-23 11:26:39.266408920+00:00
2,3,0.213857,172.27.224.70,172.27.224.250,TCP,60,50573 > 502 [ACK] Seq=13 Ack=32 Win=65051 Len=0,VMware_9d:9e:9e,VMware_9d:9e:9e,"VMware, Inc.",...,Telemecaniqu_09:51:3b,Not set,Set,TCP,1527075000.0,0.203469,0.213857,modbusQueryFlooding\eth2dump-modbusQueryFloodi...,modbusQueryFlooding,2018-05-23 11:26:39.469877958+00:00
3,4,0.309292,172.27.224.70,172.27.224.250,Modbus/TCP,66,"Query: Trans: 0; Unit: 1, Func: 3: ...",VMware_9d:9e:9e,VMware_9d:9e:9e,"VMware, Inc.",...,Telemecaniqu_09:51:3b,Not set,Set,Modbus/TCP,1527075000.0,0.095435,0.309292,modbusQueryFlooding\eth2dump-modbusQueryFloodi...,modbusQueryFlooding,2018-05-23 11:26:39.565313101+00:00
4,5,0.321041,172.27.224.250,172.27.224.70,Modbus/TCP,85,"Response: Trans: 0; Unit: 1, Func: 3: ...",Telemecaniqu_09:51:3b,Telemecaniqu_09:51:3b,Telemecanique Electrique,...,VMware_9d:9e:9e,Not set,Set,Modbus/TCP,1527075000.0,0.011749,0.321041,modbusQueryFlooding\eth2dump-modbusQueryFloodi...,modbusQueryFlooding,2018-05-23 11:26:39.577061892+00:00


In [None]:
# Check for missing values
missing_values = modbusQueryFlooding_df.isnull().sum()
print(missing_values)

No.                 0
Time                0
SrcIP               0
DstIP               0
Protocol            0
Length              0
Info                0
SrcMAC              0
SrcMACResolved      0
SrcOUIResolved      0
SrcPort             0
SequenceNumber      0
SrcOUI              0
DstMAC              0
DstMACResolved      0
DstOUI              0
DstOUResolved       0
DstPort             0
DstMACResolved.1    0
SYNFlag             0
ACKFlag             0
ProtocolType        0
EpochTime           0
TimeDelta           0
RelativeTime        0
CaptureName         0
AttackName          0
DateTime            0
dtype: int64


## TcpSYNFlood

In [None]:
csv_files = glob.glob('modbus_dataset/tcpSYNFloodDDoS/*.csv')

# Create an empty dataframe to store the combined data
tcpSYNFlood_df = pd.DataFrame()

# Loop through each CSV file and append its contents to the combined dataframe
for csv_file in csv_files:
    try:
        df = pd.read_csv(csv_file)
        capture = csv_file.split('\\')[-1].replace('.csv', '')
        df['CaptureName'] = capture
        tcpSYNFlood_df = pd.concat([tcpSYNFlood_df, df])
    except Exception as e:
        print(f"Failed to read {csv_file}: {e}")

tcpSYNFlood_df['AttackName'] = 'tcpSYNFloodDDoS'

Failed to read modbus_dataset/tcpSYNFloodDDoS\eth2dump-tcpSYNFloodDDoS30m-6h_1.csv: 'utf-8' codec can't decode byte 0x86 in position 261916: invalid start byte


In [None]:
# Convert EpochTime to datetime format and create 'date_time' column
tcpSYNFlood_df['DateTime'] = pd.to_datetime(tcpSYNFlood_df['EpochTime'], unit='s', utc=True)

In [None]:
# Check for missing values
missing_values = tcpSYNFlood_df.isnull().sum()
print(missing_values)

No.                     0
Time                    0
SrcIP                   0
DstIP                   0
Protocol                0
Length                  0
Info                  189
SrcMAC                  0
SrcMACResolved          0
SrcOUIResolved          0
SrcPort             12129
SequenceNumber      12417
SrcOUI                  0
DstMAC                  0
DstMACResolved          0
DstOUI                  0
DstOUResolved        9198
DstPort             12129
DstMACResolved.1        0
SYNFlag             12417
ACKFlag             12417
ProtocolType            0
EpochTime               0
RelativeTime            0
TimeDelta               0
CaptureName             0
AttackName              0
DateTime                0
dtype: int64


In [None]:
# 1. Handling missing values in SrcPort and DstPort
tcpSYNFlood_df['SrcPort'].fillna(-1, inplace=True)  # Filling missing SrcPort with -1
tcpSYNFlood_df['DstPort'].fillna(-1, inplace=True)  # Filling missing DstPort with -1

# 2. Handling missing values in SequenceNumber
# Assuming TCP packets have SequenceNumber and using median imputation
median_seqnum = tcpSYNFlood_df[tcpSYNFlood_df['Protocol'] == 'TCP']['SequenceNumber'].median()
tcpSYNFlood_df['SequenceNumber'].fillna(median_seqnum, inplace=True)

# 3. Handling missing values in DstOUResolved
tcpSYNFlood_df['DstOUResolved'].fillna('Unknown', inplace=True)  # Filling missing DstOUResolved with 'Unknown'

# 4. Handling missing values in SYNFlag and ACKFlag
tcpSYNFlood_df['SYNFlag'].fillna(False, inplace=True)  # Filling missing SYNFlag with False
tcpSYNFlood_df['ACKFlag'].fillna(False, inplace=True)  # Filling missing ACKFlag with False

# Fill missing Info values with a placeholder like "Unknown"
df['Info'].fillna('Unknown', inplace=True)

### TcpSYNFlood after processing

In [None]:
tcpSYNFlood_df.head()

Unnamed: 0,No.,Time,SrcIP,DstIP,Protocol,Length,Info,SrcMAC,SrcMACResolved,SrcOUIResolved,...,DstMACResolved.1,SYNFlag,ACKFlag,ProtocolType,EpochTime,RelativeTime,TimeDelta,CaptureName,AttackName,DateTime
0,1,0.0,ASUSTekC_64:40:79,Broadcast,ARP,60,Who has 172.27.224.250? Tell 172.27.224.251,48:5b:39:64:40:79,ASUSTekC_64:40:79,ASUSTek COMPUTER INC.,...,Broadcast,False,False,ARP,1526917000.0,0.0,0.0,eth2dump-tcpSYNFloodDDoS15m-1h_1,tcpSYNFloodDDoS,2018-05-21 15:32:57.012079+00:00
1,2,0.002408,Telemech_09:51:3b,ASUSTekC_64:40:79,ARP,64,172.27.224.250 is at 00:80:f4:09:51:3b,00:80:f4:09:51:3b,Telemech_09:51:3b,Telemechanique Electrique,...,ASUSTekC_64:40:79,False,False,ARP,1526917000.0,0.002408,0.002408,eth2dump-tcpSYNFloodDDoS15m-1h_1,tcpSYNFloodDDoS,2018-05-21 15:32:57.014487028+00:00
2,3,0.00242,172.27.224.251,172.27.224.250,TCP,60,57424 > 502 [SYN] Seq=0 Win=2048 Len=0 MSS=1460,48:5b:39:64:40:79,ASUSTekC_64:40:79,ASUSTek COMPUTER INC.,...,Telemech_09:51:3b,Set,Not set,TCP,1526917000.0,0.00242,1.2e-05,eth2dump-tcpSYNFloodDDoS15m-1h_1,tcpSYNFloodDDoS,2018-05-21 15:32:57.014498949+00:00
3,4,0.012697,172.27.224.250,172.27.224.251,TCP,60,"502 > 57424 [SYN, ACK] Seq=0 Ack=1 Win=8192 ...",00:80:f4:09:51:3b,Telemech_09:51:3b,Telemechanique Electrique,...,ASUSTekC_64:40:79,Set,Set,TCP,1526917000.0,0.012697,0.010277,eth2dump-tcpSYNFloodDDoS15m-1h_1,tcpSYNFloodDDoS,2018-05-21 15:32:57.024775982+00:00
4,5,0.012714,172.27.224.251,172.27.224.250,TCP,60,57424 > 502 [ACK] Seq=1 Ack=1 Win=2048 Len=0,48:5b:39:64:40:79,ASUSTekC_64:40:79,ASUSTek COMPUTER INC.,...,Telemech_09:51:3b,Not set,Set,TCP,1526917000.0,0.012714,1.7e-05,eth2dump-tcpSYNFloodDDoS15m-1h_1,tcpSYNFloodDDoS,2018-05-21 15:32:57.024792910+00:00


In [None]:
# Check for missing values
missing_values = tcpSYNFlood_df.isnull().sum()
print(missing_values)

No.                   0
Time                  0
SrcIP                 0
DstIP                 0
Protocol              0
Length                0
Info                189
SrcMAC                0
SrcMACResolved        0
SrcOUIResolved        0
SrcPort               0
SequenceNumber        0
SrcOUI                0
DstMAC                0
DstMACResolved        0
DstOUI                0
DstOUResolved         0
DstPort               0
DstMACResolved.1      0
SYNFlag               0
ACKFlag               0
ProtocolType          0
EpochTime             0
RelativeTime          0
TimeDelta             0
CaptureName           0
AttackName            0
DateTime              0
dtype: int64


## pingFloodDDoS

In [116]:
csv_files = glob.glob('modbus_dataset/pingFloodDDoS/*.csv')

pingFloodDDos_df = pd.DataFrame()

for csv_file in csv_files:
    try:
        df = pd.read_csv(csv_file)
        capture = csv_file.split('\\')[-1].replace('.csv', '')
        df['CaptureName'] = capture
        pingFloodDDos_df = pd.concat([pingFloodDDos_df, df])
    except Exception as e:
        print(f"Failed to read {csv_file}: {e}")

pingFloodDDos_df['AttackName'] = 'pingFloodDDoS'


pingFloodDDos_df

Failed to read modbus_dataset/pingFloodDDoS\eth2dump-pingFloodDDoS30m-6h_1.csv: Unable to allocate 5.76 MiB for an array with shape (755159,) and data type object


In [None]:
# Convert EpochTime to datetime format and create 'date_time' column
pingFloodDDos_df['DateTime'] = pd.to_datetime(pingFloodDDos_df['EpochTime'], unit='s', utc=True)

KeyError: 'EpochTime'

In [None]:
# Check for missing values
missing_values = pingFloodDDos_df.isnull().sum()
print(missing_values)

AttackName    92588
DateTime          0
dtype: int64


In [None]:
# 1. Handling missing values in SrcPort and DstPort
pingFloodDDos_df['SrcPort'].fillna(-1, inplace=True)  # Filling missing SrcPort with -1
pingFloodDDos_df['DstPort'].fillna(-1, inplace=True)  # Filling missing DstPort with -1

# 2. Handling missing values in SequenceNumber
# Assuming TCP packets have SequenceNumber and using median imputation
median_seqnum = pingFloodDDos_df[pingFloodDDos_df['Protocol'] == 'TCP']['SequenceNumber'].median()
pingFloodDDos_df['SequenceNumber'].fillna(median_seqnum, inplace=True)

# 3. Handling missing values in DstOUResolved
pingFloodDDos_df['DstOUResolved'].fillna('Unknown', inplace=True)  # Filling missing DstOUResolved with 'Unknown'

# 4. Handling missing values in SYNFlag and ACKFlag
pingFloodDDos_df['SYNFlag'].fillna(False, inplace=True)  # Filling missing SYNFlag with False
pingFloodDDos_df['ACKFlag'].fillna(False, inplace=True)  # Filling missing ACKFlag with False


KeyError: 'SrcPort'

### pingFloodDDos after processing

In [None]:
pingFloodDDos_df

In [None]:
# Check for missing values
missing_values = pingFloodDDos_df.isnull().sum()
print(missing_values)

No.                 0
Time                0
SrcIP               0
DstIP               0
Protocol            0
Length              0
Info                0
SrcMAC              0
SrcMACResolved      0
SrcOUIResolved      0
SrcPort             0
SequenceNumber      0
SrcOUI              0
DstMAC              0
DstMACResolved      0
DstOUI              0
DstOUResolved       0
DstPort             0
DstMACResolved.1    0
SYNFlag             0
ACKFlag             0
ProtocolType        0
EpochTime           0
RelativeTime        0
TimeDelta           0
CaptureName         0
AttackName          0
dtype: int64


In [None]:
# #MITM
# print(len(mitm_df))

# #TCP_SYN
# tcpSYNFlood_df = tcpSYNFlood_df.head(2069563)
# print(len(tcpSYNFlood_df))
# tcpSYNFlood_df

# #PING
# pingFloodDDos_df = pingFloodDDos_df.head(2069563)
# pingFloodDDos_df

# #modbusQueryFlooding
# modbusQueryFlooding_df = modbusQueryFlooding_df.head(2069563)

# #modbusQuery2Flooding_df
# modbusQuery2Flooding_df = modbusQuery2Flooding_df.head(2069563)

# mitm_df.to_csv('mitm.csv', index=False)
# clean_df.to_csv('clean.csv', index=False)
# tcpSYNFlood_df.to_csv('tcpSYNFlood.csv', index=False)
# pingFloodDDos_df.to_csv('pingFloodDDos.csv', index=False)
# modbusQueryFlooding_df.to_csv('modbusQueryFlooding.csv', index=False)
# modbusQuery2Flooding_df.to_csv('modbusQuery2Flooding.csv', index=False)

2069563
2069563


In [None]:
modbusQueryFlooding_df

Unnamed: 0,No.,Time,SrcIP,DstIP,Protocol,Length,Info,SrcMAC,SrcMACResolved,SrcOUIResolved,...,DstMACResolved.1,SYNFlag,ACKFlag,ProtocolType,EpochTime,TimeDelta,RelativeTime,CaptureName,AttackName,date_time
0,1,0.000000,HewlettPacka_8e:40:b3,Spanning-tree-(for-bridges)_00,STP,64,RST. Root = 32768/0/00:18:6e:d7:8a:c0 Cost = ...,HewlettPacka_8e:40:b3,HewlettPacka_8e:40:b3,Hewlett Packard,...,Spanning-tree-(for-bridges)_00,False,False,STP,1.526983e+09,0.036320,0.000000,modbusQueryFlooding\eth2dump-modbusQueryFloodi...,modbusQueryFlooding,2018-05-26 00:39:34.756860018
1,2,0.036320,172.27.224.251,172.27.224.250,TCP,60,"50272 > 502 [FIN, ACK] Seq=1 Ack=1 Win=2036 ...",ASUSTekCOMPU_64:40:79,ASUSTekCOMPU_64:40:79,ASUSTek COMPUTER INC.,...,Telemecaniqu_09:51:3b,Not set,Set,TCP,1.526983e+09,0.036320,0.036320,modbusQueryFlooding\eth2dump-modbusQueryFloodi...,modbusQueryFlooding,2018-05-26 00:39:34.763644934
2,3,0.046578,172.27.224.250,172.27.224.251,TCP,60,502 > 50272 [ACK] Seq=1 Ack=2 Win=8712 Len=0,Telemecaniqu_09:51:3b,Telemecaniqu_09:51:3b,Telemecanique Electrique,...,ASUSTekCOMPU_64:40:79,Not set,Set,TCP,1.526983e+09,0.010258,0.046578,modbusQueryFlooding\eth2dump-modbusQueryFloodi...,modbusQueryFlooding,2018-05-26 00:39:34.858760118
3,4,0.048935,172.27.224.250,172.27.224.251,TCP,60,"502 > 50272 [FIN, ACK] Seq=1 Ack=2 Win=8712 ...",Telemecaniqu_09:51:3b,Telemecaniqu_09:51:3b,Telemecanique Electrique,...,ASUSTekCOMPU_64:40:79,Not set,Set,TCP,1.526983e+09,0.002357,0.048935,modbusQueryFlooding\eth2dump-modbusQueryFloodi...,modbusQueryFlooding,2018-05-26 00:39:34.869422913
4,5,0.048946,172.27.224.251,172.27.224.250,TCP,60,50272 > 502 [ACK] Seq=2 Ack=2 Win=2036 Len=0,ASUSTekCOMPU_64:40:79,ASUSTekCOMPU_64:40:79,ASUSTek COMPUTER INC.,...,Telemecaniqu_09:51:3b,Not set,Set,TCP,1.526983e+09,0.000011,0.048946,modbusQueryFlooding\eth2dump-modbusQueryFloodi...,modbusQueryFlooding,2018-05-26 00:39:35.075664043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398368,398369,19267.505692,172.27.224.251,172.27.224.250,TCP,60,"57909 > 502 [FIN, ACK] Seq=13 Ack=13 Win=203...",ASUSTekCOMPU_64:40:79,ASUSTekCOMPU_64:40:79,ASUSTek COMPUTER INC.,...,Telemecaniqu_09:51:3b,Not set,Set,TCP,1.527293e+09,0.355063,19267.505692,modbusQueryFlooding\eth2dump-modbusQueryFloodi...,modbusQueryFlooding,2018-05-26 03:25:26.499504089
398369,398370,19267.515908,172.27.224.250,172.27.224.251,TCP,60,502 > 57909 [ACK] Seq=13 Ack=14 Win=8712 Len=0,Telemecaniqu_09:51:3b,Telemecaniqu_09:51:3b,Telemecanique Electrique,...,ASUSTekCOMPU_64:40:79,Not set,Set,TCP,1.527293e+09,0.010216,19267.515908,modbusQueryFlooding\eth2dump-modbusQueryFloodi...,modbusQueryFlooding,2018-05-26 03:25:26.499751091
398370,398371,19267.516764,172.27.224.250,172.27.224.251,TCP,60,"502 > 57909 [FIN, ACK] Seq=13 Ack=14 Win=871...",Telemecaniqu_09:51:3b,Telemecaniqu_09:51:3b,Telemecanique Electrique,...,ASUSTekCOMPU_64:40:79,Not set,Set,TCP,1.527293e+09,0.000856,19267.516764,modbusQueryFlooding\eth2dump-modbusQueryFloodi...,modbusQueryFlooding,2018-05-26 03:25:26.499994993
398371,398372,19267.516784,172.27.224.251,172.27.224.250,TCP,60,57909 > 502 [ACK] Seq=14 Ack=14 Win=2036 Len=0,ASUSTekCOMPU_64:40:79,ASUSTekCOMPU_64:40:79,ASUSTek COMPUTER INC.,...,Telemecaniqu_09:51:3b,Not set,Set,TCP,1.527293e+09,0.000020,19267.516784,modbusQueryFlooding\eth2dump-modbusQueryFloodi...,modbusQueryFlooding,2018-05-26 03:25:26.500250101


In [None]:
print(tcpSYNFlood_df.dtypes)

No.                          int64
Time                       float64
SrcIP                       object
DstIP                       object
Protocol                    object
Length                       int64
Info                        object
SrcMAC                      object
SrcMACResolved              object
SrcOUIResolved              object
SrcPort                    float64
SequenceNumber             float64
SrcOUI                      object
DstMAC                      object
DstMACResolved              object
DstOUI                      object
DstOUResolved               object
DstPort                    float64
DstMACResolved.1            object
SYNFlag                     object
ACKFlag                     object
ProtocolType                object
EpochTime                  float64
RelativeTime               float64
TimeDelta                  float64
CaptureName                 object
AttackName                  object
date_time           datetime64[ns]
dtype: object
