
**This file prepares the data we will need for machine learning.**

It opens the specified Wireshark capture files and cleans them up in an appropriate manner.

In [154]:
# Imports
# The star import stays first so the explicit imports below win any name clash.
from scapy.all import *
import os
import pandas as pd
import ipaddress
import csv

**The capture files are grabbed from the appropriate directories and labeled**

In [155]:
# Collect a (relative path, label) pair for every entry of the two capture
# directories. os.listdir() returns entries in arbitrary order, so each listing
# is sorted to keep the row order of the generated CSV reproducible.
files = []

for capture_dir, capture_label in (("captureFilesAttack/", "attack"),
                                   ("captureFilesNormal/", "normal")):
    for entry in sorted(os.listdir("./" + capture_dir)):
        files.append((capture_dir + entry, capture_label))

files


[('captureFilesAttack/attack.pcap', 'attack'),
 ('captureFilesNormal/http-cnn2011.pcapng', 'normal'),
 ('captureFilesNormal/http-espn2012.pcapng', 'normal'),
 ('captureFilesNormal/http-msnbc.pcapng', 'normal'),
 ('captureFilesNormal/http-wiresharkdownload.pcapng', 'normal'),
 ('captureFilesNormal/ssl_saintcon.pcapng', 'normal')]

The following function is used for:
1. Opening .pcapng files and converting them to a single .csv file
2. Labeling each file's data (a common practice in supervised learning)
3. Ensuring that there are no null values

In [156]:

def processFile(pcap_file, label):
    """Append one labelled CSV row per IP packet of a capture file.

    Rows are appended to ``normalizedData/normalized.csv`` with the fields, in
    order: source IP, destination IP, source port, destination port, IP
    protocol, IP length, TCP flags, TTL, TCP window size, TCP sequence number,
    TCP acknowledgement number, time delta, label. No header row is written
    by this function.

    Packets without an IP layer are skipped. For IP packets without a TCP
    layer, all TCP-related fields are written as 0. The time delta is the
    difference between a packet's timestamp and that of the previous IP packet
    in the same capture (0.0 for the first one).

    Args:
        pcap_file: Path of the capture file to read.
        label: Value written in the last column of every row.
    """
    # Iterating a PcapReader yields packets one at a time instead of loading
    # the entire capture into memory the way rdpcap() does.
    with PcapReader(pcap_file) as packets, open("normalizedData/normalized.csv", "a") as f:
        previous_time = None
        for packet in packets:
            # Restricting only to packets with IP (excludes ARP, but does not have to!)
            if not packet.haslayer('IP'):
                continue

            # TCP-related fields
            if packet.haslayer('TCP'):
                tcp = packet['TCP']
                src_port = tcp.sport
                dst_port = tcp.dport
                flags = tcp.flags.value
                window_size = tcp.window
                seq_number = tcp.seq
                ack_number = tcp.ack
            else:
                src_port, dst_port, flags, window_size, seq_number, ack_number = 0, 0, 0, 0, 0, 0

            # IP-related fields
            ip = packet['IP']
            src_ip = ip.src
            dst_ip = ip.dst
            proto = ip.proto
            length = ip.len  # named `length` so the builtin len() is not shadowed
            ttl = ip.ttl

            # Getting the time delta (relative to the previous IP packet only)
            if previous_time is None:
                time_delta = 0.0
            else:
                time_delta = packet.time - previous_time
            previous_time = packet.time

            # Writing all to the file
            f.write(f"{src_ip},{dst_ip},{src_port},{dst_port},{proto},{length},{flags},{ttl},{window_size},{seq_number},{ack_number},{time_delta},{label}\n")

Now we actually process our files.

In [157]:
# Making sure the file is clean: mode 'w' truncates any previous content, then
# the header row is written. The `with` block closes the handle even if the
# write fails.
with open("normalizedData/normalized.csv", 'w') as header_file:
    header_file.write("source,destination,source_port,destination_port,protocol,length,flags,ttl,window_size,seq_number,ack_number,time_delta,label\n")

In [158]:
# Run every (path, label) pair through processFile, then load the combined CSV.
# This one takes a while.
for capture_path, capture_label in files:
    processFile(capture_path, capture_label)

file = pd.read_csv('normalizedData/normalized.csv')
file.head(3)



Unnamed: 0,source,destination,source_port,destination_port,protocol,length,flags,ttl,window_size,seq_number,ack_number,time_delta,label
0,29.235.193.237,10.10.10.10,0,0,1,1297,0,104,0,0,0,0.0,attack
1,160.30.135.89,10.10.10.10,0,0,1,391,0,98,0,0,0,2.6e-05,attack
2,61.102.151.242,10.10.10.10,0,0,1,810,0,45,0,0,0,9e-06,attack


**Preparations**

Since machine learning models cannot work on categorical variables in the form of strings, we have to clean the data in a way appropriate for ML algorithms.

So, we have to convert IP addresses from dotted format to decimal

In [159]:
# Converts dotted IP to numeric
def ip_to_numeric(address):
    """Return the integer form of an IPv4/IPv6 address.

    Args:
        address: Value accepted by ``ipaddress.ip_address`` (e.g. a dotted
            IPv4 string, an IPv6 string, or an integer).

    Returns:
        The address as an ``int``; if the value cannot be parsed as an IP
        address it is returned unchanged.
    """
    try:
        return int(ipaddress.ip_address(address))
    except ValueError:
        # ip_address() raises ValueError for anything that is not a valid
        # address; other exceptions are no longer silently swallowed.
        return address

In [160]:
# `update_file` refers to the same DataFrame object as `file`; both address
# columns are converted to integers and the CSV is overwritten with the result.
update_file = file

for ip_column in ('source', 'destination'):
    update_file[ip_column] = update_file[ip_column].apply(ip_to_numeric)

update_file.to_csv('normalizedData/normalized.csv', index=False)

update_file.head(5)

Unnamed: 0,source,destination,source_port,destination_port,protocol,length,flags,ttl,window_size,seq_number,ack_number,time_delta,label
0,501989869,168430090,0,0,1,1297,0,104,0,0,0,0.0,attack
1,2686355289,168430090,0,0,1,391,0,98,0,0,0,2.6e-05,attack
2,1030133746,168430090,0,0,1,810,0,45,0,0,0,9e-06,attack
3,18314794,168430090,0,0,1,1475,0,66,0,0,0,4e-06,attack
4,103709945,168430090,0,0,1,694,0,90,0,0,0,8e-05,attack


Perfect! Now onto actual machine learning in ...